Merge tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba: "There are no exciting changes for users, it's been mostly API conversions and some fixes or refactoring. The mount API conversion is a base for future improvements that would come with VFS. Metadata processing has been converted to folios, not yet enabling the large folios but it's one patch away once everything gets tested enough. Core changes: - convert extent buffers to folios: - direct API conversion where possible - performance can drop by a few percent on metadata heavy workloads, the folio sizes are not constant and the calculations add up in the item helpers - both regular and subpage modes - data cannot be converted yet, we need to port that to iomap and there are some other generic changes required - convert mount to the new API, should not be user visible: - options deprecated long time ago have been removed: inode_cache, recovery - the new logic that splits mount to two phases slightly changes timing of device scanning for multi-device filesystems - LSM options will now work (like for selinux) - convert delayed nodes radix tree to xarray, preserving the preload-like logic that still allows to allocate with GFP_NOFS - more validation of sysfs value of scrub_speed_max - refactor chunk map structure, reduce size and improve performance - extent map refactoring, smaller data structures, improved performance - reduce size of struct extent_io_tree, embedded in several structures - temporary pages used for compression are cached and attached to a shrinker, this may slightly improve performance - in zoned mode, remove redirty extent buffer tracking, zeros are written in case an out-of-order is detected and proper data are written to the actual write pointer - cleanups, refactoring, error message improvements, updated tests - verify and update branch name or tag - remove unwanted text" * tag 'for-6.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (89 commits) btrfs: pass btrfs_io_geometry into btrfs_max_io_len btrfs: pass struct btrfs_io_geometry to set_io_stripe btrfs: open code set_io_stripe for RAID56 btrfs: change block mapping to switch/case in btrfs_map_block btrfs: factor out block mapping for single profiles btrfs: factor out block mapping for RAID5/6 btrfs: reduce scope of data_stripes in btrfs_map_block btrfs: factor out block mapping for RAID10 btrfs: factor out block mapping for DUP profiles btrfs: factor out RAID1 block mapping btrfs: factor out block-mapping for RAID0 btrfs: re-introduce struct btrfs_io_geometry btrfs: factor out helper for single device IO check btrfs: migrate btrfs_repair_io_failure() to folio interfaces btrfs: migrate eb_bitmap_offset() to folio interfaces btrfs: migrate various end io functions to folios btrfs: migrate subpage code to folio interfaces btrfs: migrate get_eb_page_index() and get_eb_offset_in_page() to folios btrfs: don't double put our subpage reference in alloc_extent_buffer btrfs: cleanup metadata page pointer usage ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-01-10 09:27:40 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-01-10 09:27:40 -0800
commit: affc5af36bbb62073b6aaa4f4459b38937ff5331 (patch)
tree: 2267430b2cc94ce7c8e87ac362bcc28e89975db1 /fs
parent: 12958e9c4c8e93ef694c10960c78453edf21526e (diff)
parent: e94dfb7a2935cb91faca88bf7136177d1ce0dda8 (diff)
60 files changed, 3827 insertions, 3110 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..1925a0919ca6 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
 void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
 {
 	token->eb = eb;
-	token->kaddr = page_address(eb->pages[0]);
+	token->kaddr = folio_address(eb->folios[0]);
 	token->offset = 0;
 }
 
@@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
  * an offset into the extent buffer page array, cast to a specific type.  This
  * gives us all the type checking.
  *
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
  * phyusical range, but the API functions assume the linear offset to the range
  * from 0 to metadata node size.
  */
@@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,		\
 			       const void *ptr, unsigned long off)	\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	const unsigned long oip = get_eb_offset_in_page(token->eb,	\
-							member_offset);	\
+	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
+							 member_offset);\
+	const int unit_size = folio_size(token->eb->folios[0]);		\
+	const int unit_shift = folio_shift(token->eb->folios[0]);	\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 									\
 	ASSERT(token);							\
 	ASSERT(token->kaddr);						\
 	ASSERT(check_setget_bounds(token->eb, ptr, off, size));		\
 	if (token->offset <= member_offset &&				\
-	    member_offset + size <= token->offset + PAGE_SIZE) {	\
-		return get_unaligned_le##bits(token->kaddr + oip);	\
+	    member_offset + size <= token->offset + unit_size) {	\
+		return get_unaligned_le##bits(token->kaddr + oil);	\
 	}								\
-	token->kaddr = page_address(token->eb->pages[idx]);		\
-	token->offset = idx << PAGE_SHIFT;				\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
-		return get_unaligned_le##bits(token->kaddr + oip);	\
+	token->kaddr = folio_address(token->eb->folios[idx]);		\
+	token->offset = idx << unit_shift;				\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+		return get_unaligned_le##bits(token->kaddr + oil);	\
 									\
-	memcpy(lebytes, token->kaddr + oip, part);			\
-	token->kaddr = page_address(token->eb->pages[idx + 1]);		\
-	token->offset = (idx + 1) << PAGE_SHIFT;			\
+	memcpy(lebytes, token->kaddr + oil, part);			\
+	token->kaddr = folio_address(token->eb->folios[idx + 1]);	\
+	token->offset = (idx + 1) << unit_shift;			\
 	memcpy(lebytes + part, token->kaddr, size - part);		\
 	return get_unaligned_le##bits(lebytes);				\
 }									\
@@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 			 const void *ptr, unsigned long off)		\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	char *kaddr = page_address(eb->pages[idx]);			\
+	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+	const unsigned long oil = get_eb_offset_in_folio(eb,		\
+							 member_offset);\
+	const int unit_size = folio_size(eb->folios[0]);		\
+	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE)	\
-		return get_unaligned_le##bits(kaddr + oip);		\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size)	\
+		return get_unaligned_le##bits(kaddr + oil);		\
 									\
-	memcpy(lebytes, kaddr + oip, part);				\
-	kaddr = page_address(eb->pages[idx + 1]);			\
+	memcpy(lebytes, kaddr + oil, part);				\
+	kaddr = folio_address(eb->folios[idx + 1]);			\
 	memcpy(lebytes + part, kaddr, size - part);			\
 	return get_unaligned_le##bits(lebytes);				\
 }									\
@@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
 			    u##bits val)				\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	const unsigned long oip = get_eb_offset_in_page(token->eb,	\
-							member_offset);	\
+	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
+							 member_offset);\
+	const int unit_size = folio_size(token->eb->folios[0]);		\
+	const int unit_shift = folio_shift(token->eb->folios[0]);	\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 									\
 	ASSERT(token);							\
 	ASSERT(token->kaddr);						\
 	ASSERT(check_setget_bounds(token->eb, ptr, off, size));		\
 	if (token->offset <= member_offset &&				\
-	    member_offset + size <= token->offset + PAGE_SIZE) {	\
-		put_unaligned_le##bits(val, token->kaddr + oip);	\
+	    member_offset + size <= token->offset + unit_size) {	\
+		put_unaligned_le##bits(val, token->kaddr + oil);	\
 		return;							\
 	}								\
-	token->kaddr = page_address(token->eb->pages[idx]);		\
-	token->offset = idx << PAGE_SHIFT;				\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
-		put_unaligned_le##bits(val, token->kaddr + oip);	\
+	token->kaddr = folio_address(token->eb->folios[idx]);		\
+	token->offset = idx << unit_shift;				\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 ||				\
+	    oil + size <= unit_size) {					\
+		put_unaligned_le##bits(val, token->kaddr + oil);	\
 		return;							\
 	}								\
 	put_unaligned_le##bits(val, lebytes);				\
-	memcpy(token->kaddr + oip, lebytes, part);			\
-	token->kaddr = page_address(token->eb->pages[idx + 1]);		\
-	token->offset = (idx + 1) << PAGE_SHIFT;			\
+	memcpy(token->kaddr + oil, lebytes, part);			\
+	token->kaddr = folio_address(token->eb->folios[idx + 1]);	\
+	token->offset = (idx + 1) << unit_shift;			\
 	memcpy(token->kaddr, lebytes + part, size - part);		\
 }									\
 void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 		      unsigned long off, u##bits val)			\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	char *kaddr = page_address(eb->pages[idx]);			\
+	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+	const unsigned long oil = get_eb_offset_in_folio(eb,		\
+							 member_offset);\
+	const int unit_size = folio_size(eb->folios[0]);		\
+	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 	u8 lebytes[sizeof(u##bits)];					\
 									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
-		put_unaligned_le##bits(val, kaddr + oip);		\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 ||				\
+	    oil + size <= unit_size) {					\
+		put_unaligned_le##bits(val, kaddr + oil);		\
 		return;							\
 	}								\
 									\
 	put_unaligned_le##bits(val, lebytes);				\
-	memcpy(kaddr + oip, lebytes, part);				\
-	kaddr = page_address(eb->pages[idx + 1]);			\
+	memcpy(kaddr + oil, lebytes, part);				\
+	kaddr = folio_address(eb->folios[idx + 1]);			\
 	memcpy(kaddr, lebytes + part, size - part);			\
 }
 
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..ed7aa32972ad 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(const struct extent_buffer *eb)	\
 {									\
-	const type *p = page_address(eb->pages[0]) +			\
+	const type *p = folio_address(eb->folios[0]) +			\
 			offset_in_page(eb->start);			\
 	return get_unaligned_le##bits(&p->member);			\
 }									\
 static inline void btrfs_set_##name(const struct extent_buffer *eb,	\
 				    u##bits val)			\
 {									\
-	type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+	type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
 	put_unaligned_le##bits(val, &p->member);			\
 }
 
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..928f512cdb4a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
 	int mirror = repair_bbio->mirror_num;
 
+	/*
+	 * We can only trigger this for data bio, which doesn't support larger
+	 * folios yet.
+	 */
+	ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
 	if (repair_bbio->bio.bi_status ||
 	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
 		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
 				  repair_bbio->file_offset, fs_info->sectorsize,
 				  repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
-				  bv->bv_page, bv->bv_offset, mirror);
+				  page_folio(bv->bv_page), bv->bv_offset, mirror);
 	} while (mirror != fbio->bbio->mirror_num);
 
 done:
@@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 /*
  * Submit bio to an async queue.
  *
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
  */
 static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
@@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
  * freeing the bio.
  */
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-			    u64 length, u64 logical, struct page *page,
-			    unsigned int pg_offset, int mirror_num)
+			    u64 length, u64 logical, struct folio *folio,
+			    unsigned int folio_offset, int mirror_num)
 {
 	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
@@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 
 	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
 	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
-	__bio_add_page(&bio, page, length, pg_offset);
+	ret = bio_add_folio(&bio, folio, length, folio_offset);
+	ASSERT(ret);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..bbaed317161a 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-			    u64 length, u64 logical, struct page *page,
-			    unsigned int pg_offset, int mirror_num);
+			    u64 length, u64 logical, struct folio *folio,
+			    unsigned int folio_offset, int mirror_num);
 
 #endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..a9be9ac99222 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 						  cache);
 
 		kfree(cache->free_space_ctl);
-		kfree(cache->physical_map);
+		btrfs_free_chunk_map(cache->physical_map);
 		kfree(cache);
 	}
 }
@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em)
+			     struct btrfs_chunk_map *map)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_path *path;
@@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	int index;
 	int factor;
 	struct btrfs_caching_control *caching_ctl = NULL;
-	bool remove_em;
+	bool remove_map;
 	bool remove_rsv = false;
 
-	block_group = btrfs_lookup_block_group(fs_info, group_start);
+	block_group = btrfs_lookup_block_group(fs_info, map->start);
 	BUG_ON(!block_group);
 	BUG_ON(!block_group->ro);
 
@@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * entries because we already removed them all when we called
 	 * btrfs_remove_free_space_cache().
 	 *
-	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * And we must not remove the chunk map from the fs_info->mapping_tree
 	 * to prevent the same logical address range and physical device space
 	 * ranges from being reused for a new block group. This is needed to
 	 * avoid races with trimming and scrub.
@@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * in place until the extents have been discarded completely when
 	 * the transaction commit has completed.
 	 */
-	remove_em = (atomic_read(&block_group->frozen) == 0);
+	remove_map = (atomic_read(&block_group->frozen) == 0);
 	spin_unlock(&block_group->lock);
 
-	if (remove_em) {
-		struct extent_map_tree *em_tree;
-
-		em_tree = &fs_info->mapping_tree;
-		write_lock(&em_tree->lock);
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		/* once for the tree */
-		free_extent_map(em);
-	}
+	if (remove_map)
+		btrfs_remove_chunk_map(fs_info, map);
 
 out:
 	/* Once for the lookup reference */
@@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
 {
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	unsigned int num_items;
 
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	read_unlock(&em_tree->lock);
-	ASSERT(em && em->start == chunk_offset);
+	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+	ASSERT(map != NULL);
+	ASSERT(map->start == chunk_offset);
 
 	/*
 	 * We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 	 * more device items and remove one chunk item), but this is done at
 	 * btrfs_remove_chunk() through a call to check_system_chunk().
 	 */
-	map = em->map_lookup;
 	num_items = 3 + map->num_stripes;
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 
 	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
 }
@@ -1927,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 			   struct btrfs_path *path)
 {
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
+	struct btrfs_chunk_map *map;
 	struct btrfs_block_group_item bg;
 	struct extent_buffer *leaf;
 	int slot;
@@ -1938,23 +1925,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	slot = path->slots[0];
 	leaf = path->nodes[0];
 
-	em_tree = &fs_info->mapping_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
-	read_unlock(&em_tree->lock);
-	if (!em) {
+	map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+	if (!map) {
 		btrfs_err(fs_info,
 			  "logical %llu len %llu found bg but no related chunk",
 			  key->objectid, key->offset);
 		return -ENOENT;
 	}
 
-	if (em->start != key->objectid || em->len != key->offset) {
+	if (map->start != key->objectid || map->chunk_len != key->offset) {
 		btrfs_err(fs_info,
 			"block group %llu len %llu mismatch with chunk %llu len %llu",
-			key->objectid, key->offset, em->start, em->len);
+			  key->objectid, key->offset, map->start, map->chunk_len);
 		ret = -EUCLEAN;
-		goto out_free_em;
+		goto out_free_map;
 	}
 
 	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +1946,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	flags = btrfs_stack_block_group_flags(&bg) &
 		BTRFS_BLOCK_GROUP_TYPE_MASK;
 
-	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 		btrfs_err(fs_info,
 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 			  key->objectid, key->offset, flags,
-			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+			  (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
 		ret = -EUCLEAN;
 	}
 
-out_free_em:
-	free_extent_map(em);
+out_free_map:
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -2024,8 +2008,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 *buf;
 	u64 bytenr;
 	u64 data_stripe_length;
@@ -2033,14 +2016,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 	int i, nr = 0;
 	int ret = 0;
 
-	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
-	if (IS_ERR(em))
+	map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+	if (IS_ERR(map))
 		return -EIO;
 
-	map = em->map_lookup;
-	data_stripe_length = em->orig_block_len;
+	data_stripe_length = map->stripe_size;
 	io_stripe_size = BTRFS_STRIPE_LEN;
-	chunk_start = em->start;
+	chunk_start = map->start;
 
 	/* For RAID5/6 adjust to a full IO stripe length */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 	*naddrs = nr;
 	*stripe_len = io_stripe_size;
 out:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -2199,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
  */
 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct btrfs_block_group *bg;
 	u64 start = 0;
 	int ret = 0;
 
 	while (1) {
-		read_lock(&map_tree->lock);
+		struct btrfs_chunk_map *map;
+		struct btrfs_block_group *bg;
+
 		/*
-		 * lookup_extent_mapping will return the first extent map
-		 * intersecting the range, so setting @len to 1 is enough to
+		 * btrfs_find_chunk_map() will return the first chunk map
+		 * intersecting the range, so setting @length to 1 is enough to
 		 * get the first chunk.
 		 */
-		em = lookup_extent_mapping(map_tree, start, 1);
-		read_unlock(&map_tree->lock);
-		if (!em)
+		map = btrfs_find_chunk_map(fs_info, start, 1);
+		if (!map)
 			break;
 
-		bg = btrfs_lookup_block_group(fs_info, em->start);
+		bg = btrfs_lookup_block_group(fs_info, map->start);
 		if (!bg) {
 			btrfs_err(fs_info,
 	"chunk start=%llu len=%llu doesn't have corresponding block group",
-				     em->start, em->len);
+				     map->start, map->chunk_len);
 			ret = -EUCLEAN;
-			free_extent_map(em);
+			btrfs_free_chunk_map(map);
 			break;
 		}
-		if (bg->start != em->start || bg->length != em->len ||
+		if (bg->start != map->start || bg->length != map->chunk_len ||
 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
-		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+		    (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 			btrfs_err(fs_info,
 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
-				em->start, em->len,
-				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+				map->start, map->chunk_len,
+				map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
 				bg->start, bg->length,
 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
 			ret = -EUCLEAN;
-			free_extent_map(em);
+			btrfs_free_chunk_map(map);
 			btrfs_put_block_group(bg);
 			break;
 		}
-		start = em->start + em->len;
-		free_extent_map(em);
+		start = map->start + map->chunk_len;
+		btrfs_free_chunk_map(map);
 		btrfs_put_block_group(bg);
 	}
 	return ret;
@@ -2369,28 +2349,25 @@ error:
 
 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct rb_node *node;
 	int ret = 0;
 
-	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
-		struct extent_map *em;
-		struct map_lookup *map;
+	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+		struct btrfs_chunk_map *map;
 		struct btrfs_block_group *bg;
 
-		em = rb_entry(node, struct extent_map, rb_node);
-		map = em->map_lookup;
-		bg = btrfs_create_block_group_cache(fs_info, em->start);
+		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+		bg = btrfs_create_block_group_cache(fs_info, map->start);
 		if (!bg) {
 			ret = -ENOMEM;
 			break;
 		}
 
 		/* Fill dummy cache as FULL */
-		bg->length = em->len;
+		bg->length = map->chunk_len;
 		bg->flags = map->type;
 		bg->cached = BTRFS_CACHE_FINISHED;
-		bg->used = em->len;
+		bg->used = map->chunk_len;
 		bg->flags = map->type;
 		ret = btrfs_add_block_group_cache(fs_info, bg);
 		/*
@@ -2618,19 +2595,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_device *device;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 dev_offset;
-	u64 stripe_size;
 	int i;
 	int ret = 0;
 
-	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
-
-	map = em->map_lookup;
-	stripe_size = em->orig_block_len;
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
 	/*
 	 * Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2619,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
 		dev_offset = map->stripes[i].physical;
 
 		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
-				       stripe_size);
+					map->stripe_size);
 		if (ret)
 			break;
 	}
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -2910,7 +2882,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 		goto unlock_out;
 
 	/*
-	 * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+	 * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
 	 * chunk allocation storm to exhaust the system chunk array.  Otherwise
 	 * we still want to try our best to mark the block group read-only.
 	 */
@@ -4406,8 +4378,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
 	bool cleanup;
 
 	spin_lock(&block_group->lock);
@@ -4416,17 +4386,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
 	spin_unlock(&block_group->lock);
 
 	if (cleanup) {
-		em_tree = &fs_info->mapping_tree;
-		write_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, block_group->start,
-					   1);
-		BUG_ON(!em); /* logic error, can't happen */
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-
-		/* once for us and once for the tree */
-		free_extent_map(em);
-		free_extent_map(em);
+		struct btrfs_chunk_map *map;
+
+		map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+		/* Logic error, can't happen. */
+		ASSERT(map);
+
+		btrfs_remove_chunk_map(fs_info, map);
+
+		/* Once for our lookup reference. */
+		btrfs_free_chunk_map(map);
 
 		/*
 		 * We may have left one free space entry and other possible
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..c4a1f01cc1c2 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -5,6 +5,8 @@
 
 #include "free-space-cache.h"
 
+struct btrfs_chunk_map;
+
 enum btrfs_disk_cache_state {
 	BTRFS_DC_WRITTEN,
 	BTRFS_DC_ERROR,
@@ -243,7 +245,7 @@ struct btrfs_block_group {
 	u64 zone_unusable;
 	u64 zone_capacity;
 	u64 meta_write_pointer;
-	struct map_lookup *physical_map;
+	struct btrfs_chunk_map *physical_map;
 	struct list_head active_bg_list;
 	struct work_struct zone_finish_work;
 	struct extent_buffer *last_eb;
@@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 				struct btrfs_fs_info *fs_info,
 				const u64 chunk_offset);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em);
+			     struct btrfs_chunk_map *map);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
 void btrfs_reclaim_bgs_work(struct work_struct *work);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..7f7c5a92d2b8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -69,6 +69,8 @@ enum {
 	BTRFS_INODE_VERITY_IN_PROGRESS,
 	/* Set when this inode is a free space inode. */
 	BTRFS_INODE_FREE_SPACE_INODE,
+	/* Set when there are no capabilities in XATTs for the inode. */
+	BTRFS_INODE_NO_CAP_XATTR,
 };
 
 /* in memory btrfs inode */
@@ -107,9 +109,11 @@ struct btrfs_inode {
 
 	/*
 	 * Keep track of where the inode has extent items mapped in order to
-	 * make sure the i_size adjustments are accurate
+	 * make sure the i_size adjustments are accurate. Not required when the
+	 * filesystem is NO_HOLES, the status can't be set while mounted as
+	 * it's a mkfs-time feature.
 	 */
-	struct extent_io_tree file_extent_tree;
+	struct extent_io_tree *file_extent_tree;
 
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
@@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 				    struct page *page, size_t pg_offset,
-				    u64 start, u64 end);
+				    u64 start, u64 len);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_inode *inode);
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..193168214eeb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
 #include <linux/log2.h>
+#include <linux/shrinker.h>
 #include <crypto/hash.h>
 #include "misc.h"
 #include "ctree.h"
@@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws,
 static void btrfs_free_compressed_pages(struct compressed_bio *cb)
 {
 	for (unsigned int i = 0; i < cb->nr_pages; i++)
-		put_page(cb->compressed_pages[i]);
+		btrfs_free_compr_page(cb->compressed_pages[i]);
 	kfree(cb->compressed_pages);
 }
 
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+	struct shrinker *shrinker;
+	spinlock_t lock;
+	struct list_head list;
+	int count;
+	int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+	int ret;
+
+	/*
+	 * We must not read the values more than once if 'ret' gets expanded in
+	 * the return statement so we don't accidentally return a negative
+	 * number, even if the first condition finds it positive.
+	 */
+	ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+	return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+	struct list_head remove;
+	struct list_head *tmp, *next;
+	int freed;
+
+	if (compr_pool.count == 0)
+		return SHRINK_STOP;
+
+	INIT_LIST_HEAD(&remove);
+
+	/* For now, just simply drain the whole list. */
+	spin_lock(&compr_pool.lock);
+	list_splice_init(&compr_pool.list, &remove);
+	freed = compr_pool.count;
+	compr_pool.count = 0;
+	spin_unlock(&compr_pool.lock);
+
+	list_for_each_safe(tmp, next, &remove) {
+		struct page *page = list_entry(tmp, struct page, lru);
+
+		ASSERT(page_ref_count(page) == 1);
+		put_page(page);
+	}
+
+	return freed;
+}
+
+/*
+ * Common wrappers for page allocation from compression wrappers
+ */
+struct page *btrfs_alloc_compr_page(void)
+{
+	struct page *page = NULL;
+
+	spin_lock(&compr_pool.lock);
+	if (compr_pool.count > 0) {
+		page = list_first_entry(&compr_pool.list, struct page, lru);
+		list_del_init(&page->lru);
+		compr_pool.count--;
+	}
+	spin_unlock(&compr_pool.lock);
+
+	if (page)
+		return page;
+
+	return alloc_page(GFP_NOFS);
+}
+
+void btrfs_free_compr_page(struct page *page)
+{
+	bool do_free = false;
+
+	spin_lock(&compr_pool.lock);
+	if (compr_pool.count > compr_pool.thresh) {
+		do_free = true;
+	} else {
+		list_add(&page->lru, &compr_pool.list);
+		compr_pool.count++;
+	}
+	spin_unlock(&compr_pool.lock);
+
+	if (!do_free)
+		return;
+
+	ASSERT(page_ref_count(page) == 1);
+	put_page(page);
+}
+
+static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
 {
 	struct compressed_bio *cb = to_compressed_bio(bbio);
 	blk_status_t status = bbio->bio.bi_status;
@@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 		for (i = 0; i < ret; i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
-							 cb->start, cb->len);
+			btrfs_folio_clamp_clear_writeback(fs_info, folio,
+							  cb->start, cb->len);
 		}
 		folio_batch_release(&fbatch);
 	}
@@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
  * This also calls the writeback end hooks for the file pages so that metadata
  * and checksums can be updated in the file.
  */
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
 {
 	struct compressed_bio *cb = to_compressed_bio(bbio);
 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
 
 	cb = alloc_compressed_bio(inode, ordered->file_offset,
 				  REQ_OP_WRITE | write_flags,
-				  end_compressed_bio_write);
+				  end_bbio_comprssed_write);
 	cb->start = ordered->file_offset;
 	cb->len = ordered->num_bytes;
 	cb->compressed_pages = compressed_pages;
@@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 * subpage::readers and to unlock the page.
 		 */
 		if (fs_info->sectorsize < PAGE_SIZE)
-			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+			btrfs_subpage_start_reader(fs_info, page_folio(page),
+						   cur, add_size);
 		put_page(page);
 		cur += add_size;
 	}
@@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 		goto out;
 	}
 
-	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+	ASSERT(extent_map_is_compressed(em));
 	compressed_len = em->block_len;
 
 	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
-				  end_compressed_bio_read);
+				  end_bbio_comprssed_read);
 
 	cb->start = em->orig_start;
 	em_len = em->len;
@@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 
 	cb->len = bbio->bio.bi_iter.bi_size;
 	cb->compressed_len = compressed_len;
-	cb->compress_type = em->compress_type;
+	cb->compress_type = extent_map_compression(em);
 	cb->orig_bbio = bbio;
 
 	free_extent_map(em);
@@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 		goto out_free_bio;
 	}
 
-	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
 	if (ret2) {
 		ret = BLK_STS_RESOURCE;
 		goto out_free_compressed_pages;
@@ -960,15 +1056,36 @@ int __init btrfs_init_compress(void)
 			offsetof(struct compressed_bio, bbio.bio),
 			BIOSET_NEED_BVECS))
 		return -ENOMEM;
+
+	compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+	if (!compr_pool.shrinker)
+		return -ENOMEM;
+
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
 	zstd_init_workspace_manager();
+
+	spin_lock_init(&compr_pool.lock);
+	INIT_LIST_HEAD(&compr_pool.list);
+	compr_pool.count = 0;
+	/* 128K / 4K = 32, for 8 threads is 256 pages. */
+	compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+	compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+	compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+	compr_pool.shrinker->batch = 32;
+	compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+	shrinker_register(compr_pool.shrinker);
+
 	return 0;
 }
 
 void __cold btrfs_exit_compress(void)
 {
+	/* For now scan drains all pages and does not touch the parameters. */
+	btrfs_compr_pool_scan(NULL, NULL);
+	shrinker_free(compr_pool.shrinker);
+
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..93cc92974dee 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
 
 #define	BTRFS_ZLIB_DEFAULT_LEVEL		3
 
+struct page;
+
 struct compressed_bio {
 	/* Number of compressed pages in the array */
 	unsigned int nr_pages;
@@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
 
 unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
 
+struct page *btrfs_alloc_compr_page(void);
+void btrfs_free_compr_page(struct page *page);
+
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c1d24d4a78..e65e012bac55 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 /*
  * check if the tree block can be shared by multiple trees
  */
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf)
 {
+	const u64 buf_gen = btrfs_header_generation(buf);
+
 	/*
 	 * Tree blocks not in shareable trees and tree roots are never shared.
 	 * If a block was allocated after the last snapshot and the block was
 	 * not allocated by tree relocation, we know the block is not shared.
 	 */
-	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
-	    buf != root->node &&
-	    (btrfs_header_generation(buf) <=
-	     btrfs_root_last_snapshot(&root->root_item) ||
-	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
-		if (buf != root->commit_root)
-			return 1;
-		/*
-		 * An extent buffer that used to be the commit root may still be
-		 * shared because the tree height may have increased and it
-		 * became a child of a higher level root. This can happen when
-		 * snapshotting a subvolume created in the current transaction.
-		 */
-		if (btrfs_header_generation(buf) == trans->transid)
-			return 1;
-	}
 
-	return 0;
+	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+		return false;
+
+	if (buf == root->node)
+		return false;
+
+	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+		return false;
+
+	if (buf != root->commit_root)
+		return true;
+
+	/*
+	 * An extent buffer that used to be the commit root may still be shared
+	 * because the tree height may have increased and it became a child of a
+	 * higher level root. This can happen when snapshotting a subvolume
+	 * created in the current transaction.
+	 */
+	if (buf_gen == trans->transid)
+		return true;
+
+	return false;
 }
 
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 	}
 
 	while (low < high) {
-		unsigned long oip;
+		const int unit_size = folio_size(eb->folios[0]);
+		unsigned long oil;
 		unsigned long offset;
 		struct btrfs_disk_key *tmp;
 		struct btrfs_disk_key unaligned;
@@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
-		oip = offset_in_page(offset);
+		oil = get_eb_offset_in_folio(eb, offset);
 
-		if (oip + key_size <= PAGE_SIZE) {
-			const unsigned long idx = get_eb_page_index(offset);
-			char *kaddr = page_address(eb->pages[idx]);
+		if (oil + key_size <= unit_size) {
+			const unsigned long idx = get_eb_folio_index(eb, offset);
+			char *kaddr = folio_address(eb->folios[idx]);
 
-			oip = get_eb_offset_in_page(eb, offset);
-			tmp = (struct btrfs_disk_key *)(kaddr + oip);
+			oil = get_eb_offset_in_folio(eb, offset);
+			tmp = (struct btrfs_disk_key *)(kaddr + oil);
 		} else {
 			read_extent_buffer(eb, &unaligned, offset, key_size);
 			tmp = &unaligned;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -212,8 +212,6 @@ struct btrfs_root {
 
 	u64 last_trans;
 
-	u32 type;
-
 	u64 free_objectid;
 
 	struct btrfs_key defrag_progress;
@@ -224,18 +222,15 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
-	spinlock_t log_extents_lock[2];
-	struct list_head logged_list[2];
-
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
 	struct rb_root inode_tree;
 
 	/*
-	 * radix tree that keeps track of delayed nodes of every inode,
-	 * protected by inode_lock
+	 * Xarray that keeps track of delayed nodes of every inode, protected
+	 * by @inode_lock.
 	 */
-	struct radix_tree_root delayed_nodes_tree;
+	struct xarray delayed_nodes;
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
@@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf);
 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct btrfs_path *path, int level, int slot);
 void btrfs_extend_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..c276b136ab63 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 	 * this em, as either we don't care about the generation, or the
 	 * merged extent map will be rejected anyway.
 	 */
-	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
 	    newer_than && em->generation >= newer_than) {
 		free_extent_map(em);
 		em = NULL;
@@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
 				   const struct extent_map *em)
 {
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+	if (extent_map_is_compressed(em))
 		return BTRFS_MAX_COMPRESSED;
 	return fs_info->max_extent_size;
 }
@@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 	/* No more em or hole */
 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
 		goto out;
-	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+	if (next->flags & EXTENT_FLAG_PREALLOC)
 		goto out;
 	/*
 	 * If the next extent is at its max capacity, defragging current extent
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		    em->len <= inode->root->fs_info->max_inline)
 			goto next;
 
-		/* Skip hole/delalloc/preallocated extents */
+		/* Skip holes and preallocated extents. */
 		if (em->block_start == EXTENT_MAP_HOLE ||
-		    em->block_start == EXTENT_MAP_DELALLOC ||
-		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		    (em->flags & EXTENT_FLAG_PREALLOC))
 			goto next;
 
 		/* Skip older extent */
@@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
 	/* Update the page status */
 	for (i = start_index - first_index; i <= last_index - first_index; i++) {
 		ClearPageChecked(pages[i]);
-		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+		btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
 	}
 	btrfs_delalloc_release_extents(inode, len);
 	extent_changeset_free(data_reserved);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..08102883f560 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 	}
 
 	spin_lock(&root->inode_lock);
-	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+	node = xa_load(&root->delayed_nodes, ino);
 
 	if (node) {
 		if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
 		/*
 		 * It's possible that we're racing into the middle of removing
-		 * this node from the radix tree.  In this case, the refcount
+		 * this node from the xarray.  In this case, the refcount
 		 * was zero and it should never go back to one.  Just return
-		 * NULL like it was never in the radix at all; our release
+		 * NULL like it was never in the xarray at all; our release
 		 * function is in the process of removing it.
 		 *
 		 * Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 		 * here, refcount_inc() may decide to just WARN_ONCE() instead
 		 * of actually bumping the refcount.
 		 *
-		 * If this node is properly in the radix, we want to bump the
+		 * If this node is properly in the xarray, we want to bump the
 		 * refcount twice, once for the inode and once for this get
 		 * operation.
 		 */
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	struct btrfs_root *root = btrfs_inode->root;
 	u64 ino = btrfs_ino(btrfs_inode);
 	int ret;
+	void *ptr;
 
 again:
 	node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ again:
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
 
-	/* cached in the btrfs inode and can be accessed */
+	/* Cached in the inode and can be accessed. */
 	refcount_set(&node->refs, 2);
 
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
+	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+	if (ret == -ENOMEM) {
 		kmem_cache_free(delayed_node_cache, node);
-		return ERR_PTR(ret);
+		return ERR_PTR(-ENOMEM);
 	}
-
 	spin_lock(&root->inode_lock);
-	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
-	if (ret == -EEXIST) {
+	ptr = xa_load(&root->delayed_nodes, ino);
+	if (ptr) {
+		/* Somebody inserted it, go back and read it. */
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, node);
-		radix_tree_preload_end();
+		node = NULL;
 		goto again;
 	}
+	ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+	ASSERT(xa_err(ptr) != -EINVAL);
+	ASSERT(xa_err(ptr) != -ENOMEM);
+	ASSERT(ptr == NULL);
 	btrfs_inode->delayed_node = node;
 	spin_unlock(&root->inode_lock);
-	radix_tree_preload_end();
 
 	return node;
 }
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
 		 * back up.  We can delete it now.
 		 */
 		ASSERT(refcount_read(&delayed_node->refs) == 0);
-		radix_tree_delete(&root->delayed_nodes_tree,
-				  delayed_node->inode_id);
+		xa_erase(&root->delayed_nodes, delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
@@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
 		goto out;
 
-	path->slots[0]++;
-	if (path->slots[0] >= btrfs_header_nritems(leaf))
-		goto search;
-again:
+	/*
+	 * Now we're going to delete the INODE_REF/EXTREF, which should be the
+	 * only one ref left.  Check if the next item is an INODE_REF/EXTREF.
+	 *
+	 * But if we're the last item already, release and search for the last
+	 * INODE_REF/EXTREF.
+	 */
+	if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+		key.objectid = node->inode_id;
+		key.type = BTRFS_INODE_EXTREF_KEY;
+		key.offset = (u64)-1;
+
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto err_out;
+		ASSERT(ret > 0);
+		ASSERT(path->slots[0] > 0);
+		ret = 0;
+		path->slots[0]--;
+		leaf = path->nodes[0];
+	} else {
+		path->slots[0]++;
+	}
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	if (key.objectid != node->inode_id)
 		goto out;
-
 	if (key.type != BTRFS_INODE_REF_KEY &&
 	    key.type != BTRFS_INODE_EXTREF_KEY)
 		goto out;
@@ -1070,22 +1093,6 @@ err_out:
 		btrfs_abort_transaction(trans, ret);
 
 	return ret;
-
-search:
-	btrfs_release_path(path);
-
-	key.type = BTRFS_INODE_EXTREF_KEY;
-	key.offset = -1;
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0)
-		goto err_out;
-	ASSERT(ret);
-
-	ret = 0;
-	leaf = path->nodes[0];
-	path->slots[0]--;
-	goto again;
 }
 
 static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
 
 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
-	u64 inode_id = 0;
+	unsigned long index = 0;
 	struct btrfs_delayed_node *delayed_nodes[8];
-	int i, n;
 
 	while (1) {
+		struct btrfs_delayed_node *node;
+		int count;
+
 		spin_lock(&root->inode_lock);
-		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
-					   (void **)delayed_nodes, inode_id,
-					   ARRAY_SIZE(delayed_nodes));
-		if (!n) {
+		if (xa_empty(&root->delayed_nodes)) {
 			spin_unlock(&root->inode_lock);
-			break;
+			return;
 		}
 
-		inode_id = delayed_nodes[n - 1]->inode_id + 1;
-		for (i = 0; i < n; i++) {
+		count = 0;
+		xa_for_each_start(&root->delayed_nodes, index, node, index) {
 			/*
 			 * Don't increase refs in case the node is dead and
 			 * about to be removed from the tree in the loop below
 			 */
-			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
-				delayed_nodes[i] = NULL;
+			if (refcount_inc_not_zero(&node->refs)) {
+				delayed_nodes[count] = node;
+				count++;
+			}
+			if (count >= ARRAY_SIZE(delayed_nodes))
+				break;
 		}
 		spin_unlock(&root->inode_lock);
+		index++;
 
-		for (i = 0; i < n; i++) {
-			if (!delayed_nodes[i])
-				continue;
+		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i]);
 		}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..1502d664c892 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 				      u64 physical)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 chunk_offset = cache->start;
 	int num_extents, cur_extent;
 	int i;
@@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 	}
 	spin_unlock(&cache->lock);
 
-	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
-	ASSERT(!IS_ERR(em));
-	map = em->map_lookup;
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+	ASSERT(!IS_ERR(map));
 
 	num_extents = 0;
 	cur_extent = 0;
@@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 			cur_extent = i;
 	}
 
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 
 	if (num_extents > 1 && cur_extent < num_extents - 1) {
 		/*
@@ -812,25 +810,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
 						struct btrfs_device *srcdev,
 						struct btrfs_device *tgtdev)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
 	u64 start = 0;
 	int i;
 
-	write_lock(&em_tree->lock);
+	write_lock(&fs_info->mapping_tree_lock);
 	do {
-		em = lookup_extent_mapping(em_tree, start, (u64)-1);
-		if (!em)
+		struct btrfs_chunk_map *map;
+
+		map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+		if (!map)
 			break;
-		map = em->map_lookup;
 		for (i = 0; i < map->num_stripes; i++)
 			if (srcdev == map->stripes[i].dev)
 				map->stripes[i].dev = tgtdev;
-		start = em->start + em->len;
-		free_extent_map(em);
+		start = map->start + map->chunk_len;
+		btrfs_free_chunk_map(map);
 	} while (start);
-	write_unlock(&em_tree->lock);
+	write_unlock(&fs_info->mapping_tree_lock);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62cb97f7c94f..c6907d533fe8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 {
 	struct btrfs_fs_info *fs_info = buf->fs_info;
-	const int num_pages = num_extent_pages(buf);
-	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+	int num_pages;
+	u32 first_page_part;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	char *kaddr;
 	int i;
 
 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
-	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+	if (buf->addr) {
+		/* Pages are contiguous, handle them as a big one. */
+		kaddr = buf->addr;
+		first_page_part = fs_info->nodesize;
+		num_pages = 1;
+	} else {
+		kaddr = folio_address(buf->folios[0]);
+		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+		num_pages = num_extent_pages(buf);
+	}
+
 	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
 			    first_page_part - BTRFS_CSUM_SIZE);
 
+	/*
+	 * Multiple single-page folios case would reach here.
+	 *
+	 * nodesize <= PAGE_SIZE and large folio all handled by above
+	 * crypto_shash_update() already.
+	 */
 	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
-		kaddr = page_address(buf->pages[i]);
+		kaddr = folio_address(buf->folios[i]);
 		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 	}
 	memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
 				      int mirror_num)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	int i, num_pages = num_extent_pages(eb);
+	int num_folios = num_extent_folios(eb);
 	int ret = 0;
 
 	if (sb_rdonly(fs_info->sb))
 		return -EROFS;
 
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-		u64 start = max_t(u64, eb->start, page_offset(p));
-		u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = eb->folios[i];
+		u64 start = max_t(u64, eb->start, folio_pos(folio));
+		u64 end = min_t(u64, eb->start + eb->len,
+				folio_pos(folio) + folio_size(folio));
 		u32 len = end - start;
 
 		ret = btrfs_repair_io_failure(fs_info, 0, start, len,
-				start, p, offset_in_page(start), mirror_num);
+					      start, folio, offset_in_folio(folio, start),
+					      mirror_num);
 		if (ret)
 			break;
 	}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
 	if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
 		return BLK_STS_IOERR;
 
-	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
-		WARN_ON_ONCE(found_start != 0);
+	/*
+	 * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+	 * checksum it but zero-out its content. This is done to preserve
+	 * ordering of I/O without unnecessarily writing out data.
+	 */
+	if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+		memzero_extent_buffer(eb, 0, eb->len);
 		return BLK_STS_OK;
 	}
 
 	if (WARN_ON_ONCE(found_start != eb->start))
 		return BLK_STS_IOERR;
-	if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
-					      eb->len)))
+	if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+					       eb->start, eb->len)))
 		return BLK_STS_IOERR;
 
 	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
 	}
 
 	csum_tree_block(eb, result);
-	header_csum = page_address(eb->pages[0]) +
-		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+	header_csum = folio_address(eb->folios[0]) +
+		get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
 
 	if (memcmp(result, header_csum, csum_size) != 0) {
 		btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
-	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+	/* GFP flags are compatible with XA_FLAGS_*. */
+	xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
 
 	btrfs_init_root_block_rsv(root);
 
@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&root->ordered_extents);
 	INIT_LIST_HEAD(&root->ordered_root);
 	INIT_LIST_HEAD(&root->reloc_dirty_list);
-	INIT_LIST_HEAD(&root->logged_list[0]);
-	INIT_LIST_HEAD(&root->logged_list[1]);
 	spin_lock_init(&root->inode_lock);
 	spin_lock_init(&root->delalloc_lock);
 	spin_lock_init(&root->ordered_extent_lock);
 	spin_lock_init(&root->accounting_lock);
-	spin_lock_init(&root->log_extents_lock[0]);
-	spin_lock_init(&root->log_extents_lock[1]);
 	spin_lock_init(&root->qgroup_meta_rsv_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
@@ -2618,9 +2639,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 			 */
 			btrfs_set_super_log_root(sb, 0);
 
-			/* We can't trust the free space cache either */
-			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
 			btrfs_warn(fs_info, "try to load backup roots slot %d", i);
 			ret = read_backup_root(fs_info, i);
 			backup_index = ret;
@@ -2724,7 +2742,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);
 	spin_lock_init(&fs_info->eb_leak_lock);
 #endif
-	extent_map_tree_init(&fs_info->mapping_tree);
+	fs_info->mapping_tree = RB_ROOT_CACHED;
+	rwlock_init(&fs_info->mapping_tree_lock);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2813,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->sectorsize_bits = ilog2(4096);
 	fs_info->stripesize = 4096;
 
+	/* Default compress algorithm when user does -o compress */
+	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
 	fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
 
 	spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2953,6 @@ out:
 }
 
 /*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
-	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
-	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
  * Mounting logic specific to read-write file systems. Shared by open_ctree
  * and btrfs_remount when remounting from read-only to read-write.
  */
@@ -2953,7 +2964,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
 
 	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
 	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-		rebuild_free_space_tree = true;
+		if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+			btrfs_warn(fs_info,
+				   "'clear_cache' option is ignored with extent tree v2");
+		else
+			rebuild_free_space_tree = true;
 	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
 		btrfs_warn(fs_info, "free space tree is invalid");
@@ -3276,13 +3291,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
 		WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
 
-	/*
-	 * In the long term, we'll store the compression type in the super
-	 * block, and it'll be used for per file compression control.
-	 */
-	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
 	/* Set up fs_info before parsing mount options */
 	nodesize = btrfs_super_nodesize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,28 +3304,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
 
-	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
-	if (ret)
+	/*
+	 * Handle the space caching options appropriately now that we have the
+	 * super block loaded and validated.
+	 */
+	btrfs_set_free_space_cache_settings(fs_info);
+
+	if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+		ret = -EINVAL;
 		goto fail_alloc;
+	}
 
 	ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
 	if (ret < 0)
 		goto fail_alloc;
 
+	/*
+	 * At this point our mount options are validated, if we set ->max_inline
+	 * to something non-standard make sure we truncate it to sectorsize.
+	 */
+	fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
 	if (sectorsize < PAGE_SIZE) {
 		struct btrfs_subpage_info *subpage_info;
 
-		/*
-		 * V1 space cache has some hardcoded PAGE_SIZE usage, and is
-		 * going to be deprecated.
-		 *
-		 * Force to use v2 cache for subpage case.
-		 */
-		btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
-		btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
-			"forcing free space tree for sector size %u with page size %lu",
-			sectorsize, PAGE_SIZE);
-
 		btrfs_warn(fs_info,
 		"read-write for sector size %u with page size %lu is experimental",
 			   sectorsize, PAGE_SIZE);
@@ -3494,29 +3504,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_cleaner;
 	}
 
-	if (!btrfs_test_opt(fs_info, NOSSD) &&
-	    !fs_info->fs_devices->rotating) {
-		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
-	}
-
-	/*
-	 * For devices supporting discard turn on discard=async automatically,
-	 * unless it's already set or disabled. This could be turned off by
-	 * nodiscard for the same mount.
-	 *
-	 * The zoned mode piggy backs on the discard functionality for
-	 * resetting a zone. There is no reason to delay the zone reset as it is
-	 * fast enough. So, do not enable async discard for zoned mode.
-	 */
-	if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
-	      btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
-	      btrfs_test_opt(fs_info, NODISCARD)) &&
-	    fs_info->fs_devices->discardable &&
-	    !btrfs_is_zoned(fs_info)) {
-		btrfs_set_and_info(fs_info, DISCARD_ASYNC,
-				   "auto enabling async discard");
-	}
-
 	ret = btrfs_read_qgroup_config(fs_info);
 	if (ret)
 		goto fail_trans_kthread;
@@ -3542,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	}
 
 	if (sb_rdonly(sb))
-		goto clear_oneshot;
+		return 0;
 
 	ret = btrfs_start_pre_rw_mount(fs_info);
 	if (ret) {
@@ -3570,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
 		wake_up_process(fs_info->cleaner_kthread);
 
-clear_oneshot:
-	btrfs_clear_oneshot_options(fs_info);
 	return 0;
 
 fail_qgroup:
@@ -3608,7 +3593,7 @@ fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
 	btrfs_free_block_groups(fs_info);
 fail_alloc:
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_mapping_tree_free(fs_info);
 
 	iput(fs_info->btree_inode);
 fail:
@@ -4391,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
 	iput(fs_info->btree_inode);
 
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_mapping_tree_free(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 50dab8f639dc..9413726b329b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
 						struct btrfs_fs_info *fs_info,
 						u64 bytenr, u64 owner_root,
 						int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
-			      struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
 int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 			   const struct btrfs_super_block *disk_sb);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ea149be28dff..e3ee5449cc4a 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 						       struct extent_io_tree *tree,
 						       u64 start, u64 end)
 {
-	struct btrfs_inode *inode = tree->inode;
+	const struct btrfs_inode *inode;
 	u64 isize;
 
-	if (!inode)
+	if (tree->owner != IO_TREE_INODE_IO)
 		return;
 
+	inode = extent_io_tree_to_inode_const(tree);
 	isize = i_size_read(&inode->vfs_inode);
 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 		btrfs_debug_rl(inode->root->fs_info,
@@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 #define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
 #endif
 
+
 /*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc.  These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
  */
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+	return tree->owner == IO_TREE_INODE_IO;
+}
+
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+	if (tree->owner == IO_TREE_INODE_IO)
+		return tree->inode;
+	return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+	if (tree->owner == IO_TREE_INODE_IO)
+		return tree->inode;
+	return NULL;
+}
 
-struct tree_entry {
-	u64 start;
-	u64 end;
-	struct rb_node rb_node;
-};
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+	if (tree->owner == IO_TREE_INODE_IO)
+		return tree->inode->root->fs_info;
+	return tree->fs_info;
+}
 
 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 			 struct extent_io_tree *tree, unsigned int owner)
 {
-	tree->fs_info = fs_info;
 	tree->state = RB_ROOT;
 	spin_lock_init(&tree->lock);
-	tree->inode = NULL;
+	tree->fs_info = fs_info;
 	tree->owner = owner;
-	if (owner == IO_TREE_INODE_FILE_EXTENT)
-		lockdep_set_class(&tree->lock, &file_extent_tree_class);
 }
 
 /*
@@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
 	return tree_search_for_insert(tree, offset, NULL, NULL);
 }
 
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+				 const struct extent_state *state,
+				 const char *opname,
+				 int err)
 {
-	btrfs_panic(tree->fs_info, err,
-	"locking error: extent tree was modified by another thread while locked");
+	btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+		    "extent io tree error on %s state start %llu end %llu",
+		    opname, state->start, state->end);
 }
 
 static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
@@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
 
 	prev = prev_state(state);
 	if (prev && prev->end == state->start - 1 && prev->state == state->state) {
-		if (tree->inode)
-			btrfs_merge_delalloc_extent(tree->inode, state, prev);
+		if (is_inode_io_tree(tree))
+			btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+						    state, prev);
 		state->start = prev->start;
 		rb_erase(&prev->rb_node, &tree->state);
 		RB_CLEAR_NODE(&prev->rb_node);
@@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
 
 	next = next_state(state);
 	if (next && next->start == state->end + 1 && next->state == state->state) {
-		if (tree->inode)
-			btrfs_merge_delalloc_extent(tree->inode, state, next);
+		if (is_inode_io_tree(tree))
+			btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+						    state, next);
 		state->end = next->end;
 		rb_erase(&next->rb_node, &tree->state);
 		RB_CLEAR_NODE(&next->rb_node);
@@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree,
 	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
 	int ret;
 
-	if (tree->inode)
-		btrfs_set_delalloc_extent(tree->inode, state, bits);
+	if (is_inode_io_tree(tree))
+		btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
 
 	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 	BUG_ON(ret < 0);
@@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
 		if (state->end < entry->start) {
 			if (try_merge && end == entry->start &&
 			    state->state == entry->state) {
-				if (tree->inode)
-					btrfs_merge_delalloc_extent(tree->inode,
-								    state, entry);
+				if (is_inode_io_tree(tree))
+					btrfs_merge_delalloc_extent(
+							extent_io_tree_to_inode(tree),
+							state, entry);
 				entry->start = state->start;
 				merge_prev_state(tree, entry);
 				state->state = 0;
@@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
 		} else if (state->end > entry->end) {
 			if (try_merge && entry->end == start &&
 			    state->state == entry->state) {
-				if (tree->inode)
-					btrfs_merge_delalloc_extent(tree->inode,
-								    state, entry);
+				if (is_inode_io_tree(tree))
+					btrfs_merge_delalloc_extent(
+							extent_io_tree_to_inode(tree),
+							state, entry);
 				entry->end = state->end;
 				merge_next_state(tree, entry);
 				state->state = 0;
@@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
 			}
 			node = &(*node)->rb_right;
 		} else {
-			btrfs_err(tree->fs_info,
-			       "found node %llu %llu on insert of %llu %llu",
-			       entry->start, entry->end, state->start, state->end);
 			return ERR_PTR(-EEXIST);
 		}
 	}
@@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	struct rb_node *parent = NULL;
 	struct rb_node **node;
 
-	if (tree->inode)
-		btrfs_split_delalloc_extent(tree->inode, orig, split);
+	if (is_inode_io_tree(tree))
+		btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+					    split);
 
 	prealloc->start = orig->start;
 	prealloc->end = split - 1;
@@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
 	int ret;
 
-	if (tree->inode)
-		btrfs_clear_delalloc_extent(tree->inode, state, bits);
+	if (is_inode_io_tree(tree))
+		btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+					    bits);
 
 	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 	BUG_ON(ret < 0);
@@ -695,7 +718,7 @@ hit_next:
 			goto search_again;
 		err = split_state(tree, state, prealloc, start);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 
 		prealloc = NULL;
 		if (err)
@@ -717,7 +740,7 @@ hit_next:
 			goto search_again;
 		err = split_state(tree, state, prealloc, end + 1);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 
 		if (wake)
 			wake_up(&state->wq);
@@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
 	struct extent_state *state;
 	int ret = 1;
 
+	ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
 	spin_lock(&tree->lock);
 	state = find_first_extent_bit_state(tree, start, bits);
 	if (state) {
@@ -1152,7 +1177,7 @@ hit_next:
 			goto search_again;
 		err = split_state(tree, state, prealloc, start);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 
 		prealloc = NULL;
 		if (err)
@@ -1200,7 +1225,7 @@ hit_next:
 		inserted_state = insert_state(tree, prealloc, bits, changeset);
 		if (IS_ERR(inserted_state)) {
 			err = PTR_ERR(inserted_state);
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, prealloc, "insert", err);
 		}
 
 		cache_state(inserted_state, cached_state);
@@ -1228,7 +1253,7 @@ hit_next:
 			goto search_again;
 		err = split_state(tree, state, prealloc, end + 1);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 
 		set_state_bits(tree, prealloc, bits, changeset);
 		cache_state(prealloc, cached_state);
@@ -1382,7 +1407,7 @@ hit_next:
 		}
 		err = split_state(tree, state, prealloc, start);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 		prealloc = NULL;
 		if (err)
 			goto out;
@@ -1430,7 +1455,7 @@ hit_next:
 		inserted_state = insert_state(tree, prealloc, bits, NULL);
 		if (IS_ERR(inserted_state)) {
 			err = PTR_ERR(inserted_state);
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, prealloc, "insert", err);
 		}
 		cache_state(inserted_state, cached_state);
 		if (inserted_state == prealloc)
@@ -1453,7 +1478,7 @@ hit_next:
 
 		err = split_state(tree, state, prealloc, end + 1);
 		if (err)
-			extent_io_tree_panic(tree, err);
+			extent_io_tree_panic(tree, state, "split", err);
 
 		set_state_bits(tree, prealloc, bits, NULL);
 		cache_state(prealloc, cached_state);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 5602b0137fcd..ebe6390d65e9 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -87,9 +87,17 @@ enum {
 
 struct extent_io_tree {
 	struct rb_root state;
-	struct btrfs_fs_info *fs_info;
-	/* Inode associated with this tree, or NULL. */
-	struct btrfs_inode *inode;
+	/*
+	 * The fs_info is needed for trace points, a tree attached to an inode
+	 * needs the inode.
+	 *
+	 * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
+	 *                             accessed as inode->root->fs_info
+	 */
+	union {
+		struct btrfs_fs_info *fs_info;
+		struct btrfs_inode *inode;
+	};
 
 	/* Who owns this io tree, should be one of IO_TREE_* */
 	u8 owner;
@@ -112,6 +120,10 @@ struct extent_state {
 #endif
 };
 
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 			 struct extent_io_tree *tree, unsigned int owner);
 void extent_io_tree_release(struct extent_io_tree *tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01423670bc8a..f396aba92c57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3447,6 +3447,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_ref generic_ref = { 0 };
+	struct btrfs_block_group *bg;
 	int ret;
 
 	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3460,67 +3461,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		BUG_ON(ret); /* -ENOMEM */
 	}
 
-	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
-		struct btrfs_block_group *cache;
-		bool must_pin = false;
-
-		if (root_id != BTRFS_TREE_LOG_OBJECTID) {
-			ret = check_ref_cleanup(trans, buf->start);
-			if (!ret) {
-				btrfs_redirty_list_add(trans->transaction, buf);
-				goto out;
-			}
-		}
+	if (!last_ref)
+		return;
 
-		cache = btrfs_lookup_block_group(fs_info, buf->start);
+	if (btrfs_header_generation(buf) != trans->transid)
+		goto out;
 
-		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-			pin_down_extent(trans, cache, buf->start, buf->len, 1);
-			btrfs_put_block_group(cache);
+	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+		ret = check_ref_cleanup(trans, buf->start);
+		if (!ret)
 			goto out;
-		}
+	}
 
-		/*
-		 * If there are tree mod log users we may have recorded mod log
-		 * operations for this node.  If we re-allocate this node we
-		 * could replay operations on this node that happened when it
-		 * existed in a completely different root.  For example if it
-		 * was part of root A, then was reallocated to root B, and we
-		 * are doing a btrfs_old_search_slot(root b), we could replay
-		 * operations that happened when the block was part of root A,
-		 * giving us an inconsistent view of the btree.
-		 *
-		 * We are safe from races here because at this point no other
-		 * node or root points to this extent buffer, so if after this
-		 * check a new tree mod log user joins we will not have an
-		 * existing log of operations on this node that we have to
-		 * contend with.
-		 */
-		if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
-			must_pin = true;
+	bg = btrfs_lookup_block_group(fs_info, buf->start);
 
-		if (must_pin || btrfs_is_zoned(fs_info)) {
-			btrfs_redirty_list_add(trans->transaction, buf);
-			pin_down_extent(trans, cache, buf->start, buf->len, 1);
-			btrfs_put_block_group(cache);
-			goto out;
-		}
+	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+		pin_down_extent(trans, bg, buf->start, buf->len, 1);
+		btrfs_put_block_group(bg);
+		goto out;
+	}
 
-		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+	/*
+	 * If there are tree mod log users we may have recorded mod log
+	 * operations for this node.  If we re-allocate this node we
+	 * could replay operations on this node that happened when it
+	 * existed in a completely different root.  For example if it
+	 * was part of root A, then was reallocated to root B, and we
+	 * are doing a btrfs_old_search_slot(root b), we could replay
+	 * operations that happened when the block was part of root A,
+	 * giving us an inconsistent view of the btree.
+	 *
+	 * We are safe from races here because at this point no other
+	 * node or root points to this extent buffer, so if after this
+	 * check a new tree mod log user joins we will not have an
+	 * existing log of operations on this node that we have to
+	 * contend with.
+	 */
 
-		btrfs_add_free_space(cache, buf->start, buf->len);
-		btrfs_free_reserved_bytes(cache, buf->len, 0);
-		btrfs_put_block_group(cache);
-		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+		     || btrfs_is_zoned(fs_info)) {
+		pin_down_extent(trans, bg, buf->start, buf->len, 1);
+		btrfs_put_block_group(bg);
+		goto out;
 	}
+
+	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+	btrfs_add_free_space(bg, buf->start, buf->len);
+	btrfs_free_reserved_bytes(bg, buf->len, 0);
+	btrfs_put_block_group(bg);
+	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
 out:
-	if (last_ref) {
-		/*
-		 * Deleting the buffer, clear the corrupt flag since it doesn't
-		 * matter anymore.
-		 */
-		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
-	}
+
+	/*
+	 * Deleting the buffer, clear the corrupt flag since it doesn't
+	 * matter anymore.
+	 */
+	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
 }
 
 /* Can return -ENOMEM */
@@ -5061,7 +5059,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	__btrfs_tree_lock(buf, nest);
 	btrfs_clear_buffer_dirty(trans, buf);
 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
-	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
 
 	set_extent_buffer_uptodate(buf);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b6ff6f320198..cfd2967f04a2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
 			     struct page *page, struct page *locked_page,
 			     unsigned long page_ops, u64 start, u64 end)
 {
+	struct folio *folio = page_folio(page);
 	u32 len;
 
 	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
 	len = end + 1 - start;
 
 	if (page_ops & PAGE_SET_ORDERED)
-		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+		btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
 	if (page_ops & PAGE_START_WRITEBACK) {
-		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
-		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+		btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+		btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
 	}
 	if (page_ops & PAGE_END_WRITEBACK)
-		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+		btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
 
 	if (page != locked_page && (page_ops & PAGE_UNLOCK))
-		btrfs_page_end_writer_lock(fs_info, page, start, len);
+		btrfs_folio_end_writer_lock(fs_info, folio, start, len);
 }
 
 static void __process_pages_contig(struct address_space *mapping,
@@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 			goto out;
 
 		for (i = 0; i < found_folios; i++) {
-			struct page *page = &fbatch.folios[i]->page;
+			struct folio *folio = fbatch.folios[i];
+			struct page *page = folio_page(folio, 0);
 			u32 len = end + 1 - start;
 
 			if (page == locked_page)
 				continue;
 
-			if (btrfs_page_start_writer_lock(fs_info, page, start,
-							 len))
+			if (btrfs_folio_start_writer_lock(fs_info, folio, start,
+							  len))
 				goto out;
 
 			if (!PageDirty(page) || page->mapping != mapping) {
-				btrfs_page_end_writer_lock(fs_info, page, start,
-							   len);
+				btrfs_folio_end_writer_lock(fs_info, folio, start,
+							    len);
 				goto out;
 			}
 
@@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start)
 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct folio *folio = page_folio(page);
 
 	ASSERT(page_offset(page) <= start &&
 	       start + len <= page_offset(page) + PAGE_SIZE);
 
 	if (uptodate && btrfs_verify_page(page, start))
-		btrfs_page_set_uptodate(fs_info, page, start, len);
+		btrfs_folio_set_uptodate(fs_info, folio, start, len);
 	else
-		btrfs_page_clear_uptodate(fs_info, page, start, len);
+		btrfs_folio_clear_uptodate(fs_info, folio, start, len);
 
-	if (!btrfs_is_subpage(fs_info, page))
+	if (!btrfs_is_subpage(fs_info, page->mapping))
 		unlock_page(page);
 	else
-		btrfs_subpage_end_reader(fs_info, page, start, len);
+		btrfs_subpage_end_reader(fs_info, folio, start, len);
 }
 
 /*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - filio_end_writeback()  if there is no more pending io for the folio
  *
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
 {
 	struct bio *bio = &bbio->bio;
 	int error = blk_status_to_errno(bio->bi_status);
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		struct page *page = bvec->bv_page;
-		struct inode *inode = page->mapping->host;
+	bio_for_each_folio_all(fi, bio) {
+		struct folio *folio = fi.folio;
+		struct inode *inode = folio->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 		const u32 sectorsize = fs_info->sectorsize;
-		u64 start = page_offset(page) + bvec->bv_offset;
-		u32 len = bvec->bv_len;
+		u64 start = folio_pos(folio) + fi.offset;
+		u32 len = fi.length;
+
+		/* Only order 0 (single page) folios are allowed for data. */
+		ASSERT(folio_order(folio) == 0);
 
 		/* Our read/write should always be sector aligned. */
-		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+		if (!IS_ALIGNED(fi.offset, sectorsize))
 			btrfs_err(fs_info,
-		"partial page write in btrfs with offset %u and length %u",
-				  bvec->bv_offset, bvec->bv_len);
-		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+		"partial page write in btrfs with offset %zu and length %zu",
+				  fi.offset, fi.length);
+		else if (!IS_ALIGNED(fi.length, sectorsize))
 			btrfs_info(fs_info,
-		"incomplete page write with offset %u and length %u",
-				   bvec->bv_offset, bvec->bv_len);
+		"incomplete page write with offset %zu and length %zu",
+				   fi.offset, fi.length);
 
-		btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+		btrfs_finish_ordered_extent(bbio->ordered,
+				folio_page(folio, 0), start, len, !error);
 		if (error)
-			mapping_set_error(page->mapping, error);
-		btrfs_page_clear_writeback(fs_info, page, start, len);
+			mapping_set_error(folio->mapping, error);
+		btrfs_folio_clear_writeback(fs_info, folio, start, len);
 	}
 
 	bio_put(bio);
@@ -562,98 +569,102 @@ update:
 
 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
 {
-	ASSERT(PageLocked(page));
-	if (!btrfs_is_subpage(fs_info, page))
+	struct folio *folio = page_folio(page);
+
+	ASSERT(folio_test_locked(folio));
+	if (!btrfs_is_subpage(fs_info, folio->mapping))
 		return;
 
-	ASSERT(PagePrivate(page));
-	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+	ASSERT(folio_test_private(folio));
+	btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
 }
 
 /*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
  *
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
 {
 	struct bio *bio = &bbio->bio;
-	struct bio_vec *bvec;
 	struct processed_extent processed = { 0 };
+	struct folio_iter fi;
 	/*
 	 * The offset to the beginning of a bio, since one bio can never be
 	 * larger than UINT_MAX, u32 here is enough.
 	 */
 	u32 bio_offset = 0;
-	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, iter_all) {
+	bio_for_each_folio_all(fi, &bbio->bio) {
 		bool uptodate = !bio->bi_status;
-		struct page *page = bvec->bv_page;
-		struct inode *inode = page->mapping->host;
+		struct folio *folio = fi.folio;
+		struct inode *inode = folio->mapping->host;
 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 		const u32 sectorsize = fs_info->sectorsize;
 		u64 start;
 		u64 end;
 		u32 len;
 
+		/* For now only order 0 folios are supported for data. */
+		ASSERT(folio_order(folio) == 0);
 		btrfs_debug(fs_info,
-			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
-			bio->bi_iter.bi_sector, bio->bi_status,
+			"%s: bi_sector=%llu, err=%d, mirror=%u",
+			__func__, bio->bi_iter.bi_sector, bio->bi_status,
 			bbio->mirror_num);
 
 		/*
 		 * We always issue full-sector reads, but if some block in a
-		 * page fails to read, blk_update_request() will advance
+		 * folio fails to read, blk_update_request() will advance
 		 * bv_offset and adjust bv_len to compensate.  Print a warning
 		 * for unaligned offsets, and an error if they don't add up to
 		 * a full sector.
 		 */
-		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+		if (!IS_ALIGNED(fi.offset, sectorsize))
 			btrfs_err(fs_info,
-		"partial page read in btrfs with offset %u and length %u",
-				  bvec->bv_offset, bvec->bv_len);
-		else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
-				     sectorsize))
+		"partial page read in btrfs with offset %zu and length %zu",
+				  fi.offset, fi.length);
+		else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
 			btrfs_info(fs_info,
-		"incomplete page read with offset %u and length %u",
-				   bvec->bv_offset, bvec->bv_len);
+		"incomplete page read with offset %zu and length %zu",
+				   fi.offset, fi.length);
 
-		start = page_offset(page) + bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-		len = bvec->bv_len;
+		start = folio_pos(folio) + fi.offset;
+		end = start + fi.length - 1;
+		len = fi.length;
 
 		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
-			pgoff_t end_index = i_size >> PAGE_SHIFT;
+			pgoff_t end_index = i_size >> folio_shift(folio);
 
 			/*
 			 * Zero out the remaining part if this range straddles
 			 * i_size.
 			 *
-			 * Here we should only zero the range inside the bvec,
+			 * Here we should only zero the range inside the folio,
 			 * not touch anything else.
 			 *
 			 * NOTE: i_size is exclusive while end is inclusive.
 			 */
-			if (page->index == end_index && i_size <= end) {
-				u32 zero_start = max(offset_in_page(i_size),
-						     offset_in_page(start));
+			if (folio_index(folio) == end_index && i_size <= end) {
+				u32 zero_start = max(offset_in_folio(folio, i_size),
+						     offset_in_folio(folio, start));
+				u32 zero_len = offset_in_folio(folio, end) + 1 -
+					       zero_start;
 
-				zero_user_segment(page, zero_start,
-						  offset_in_page(end) + 1);
+				folio_zero_range(folio, zero_start, zero_len);
 			}
 		}
 
 		/* Update page status and unlock. */
-		end_page_read(page, uptodate, start, len);
+		end_page_read(folio_page(folio, 0), uptodate, start, len);
 		endio_readpage_release_extent(&processed, BTRFS_I(inode),
 					      start, end, uptodate);
 
@@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
  * @nr_pages:   number of pages to allocate
  * @page_array: the array to fill with pages; any existing non-null entries in
  * 		the array will be skipped
+ * @extra_gfp:	the extra GFP flags for the allocation.
  *
  * Return: 0        if all pages were able to be allocated;
  *         -ENOMEM  otherwise, the partially allocated pages would be freed and
  *                  the array slots zeroed
  */
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+			   gfp_t extra_gfp)
 {
 	unsigned int allocated;
 
 	for (allocated = 0; allocated < nr_pages;) {
 		unsigned int last = allocated;
 
-		allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+		allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp,
+						   nr_pages, page_array);
 
 		if (allocated == nr_pages)
 			return 0;
@@ -707,6 +721,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
 	return 0;
 }
 
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always in order 0 (aka, single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+{
+	struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+	int num_pages = num_extent_pages(eb);
+	int ret;
+
+	ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+	if (ret < 0)
+		return ret;
+
+	for (int i = 0; i < num_pages; i++)
+		eb->folios[i] = page_folio(page_array[i]);
+	return 0;
+}
+
 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
 				struct page *page, u64 disk_bytenr,
 				unsigned int pg_offset)
@@ -861,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
 	} while (size);
 }
 
-static int attach_extent_buffer_page(struct extent_buffer *eb,
-				     struct page *page,
-				     struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+				      struct folio *folio,
+				      struct btrfs_subpage *prealloc)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	int ret = 0;
@@ -874,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
 	 * For cloned or dummy extent buffers, their pages are not mapped and
 	 * will not race with any other ebs.
 	 */
-	if (page->mapping)
-		lockdep_assert_held(&page->mapping->i_private_lock);
+	if (folio->mapping)
+		lockdep_assert_held(&folio->mapping->i_private_lock);
 
 	if (fs_info->nodesize >= PAGE_SIZE) {
-		if (!PagePrivate(page))
-			attach_page_private(page, eb);
+		if (!folio_test_private(folio))
+			folio_attach_private(folio, eb);
 		else
-			WARN_ON(page->private != (unsigned long)eb);
+			WARN_ON(folio_get_private(folio) != eb);
 		return 0;
 	}
 
 	/* Already mapped, just free prealloc */
-	if (PagePrivate(page)) {
+	if (folio_test_private(folio)) {
 		btrfs_free_subpage(prealloc);
 		return 0;
 	}
 
 	if (prealloc)
 		/* Has preallocated memory for subpage */
-		attach_page_private(page, prealloc);
+		folio_attach_private(folio, prealloc);
 	else
 		/* Do new allocation to attach subpage */
-		ret = btrfs_attach_subpage(fs_info, page,
-					   BTRFS_SUBPAGE_METADATA);
+		ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
 	return ret;
 }
 
 int set_page_extent_mapped(struct page *page)
 {
+	struct folio *folio = page_folio(page);
 	struct btrfs_fs_info *fs_info;
 
 	ASSERT(page->mapping);
 
-	if (PagePrivate(page))
+	if (folio_test_private(folio))
 		return 0;
 
 	fs_info = btrfs_sb(page->mapping->host->i_sb);
 
-	if (btrfs_is_subpage(fs_info, page))
-		return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+	if (btrfs_is_subpage(fs_info, page->mapping))
+		return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
 
-	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+	folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
 	return 0;
 }
 
 void clear_page_extent_mapped(struct page *page)
 {
+	struct folio *folio = page_folio(page);
 	struct btrfs_fs_info *fs_info;
 
 	ASSERT(page->mapping);
 
-	if (!PagePrivate(page))
+	if (!folio_test_private(folio))
 		return;
 
 	fs_info = btrfs_sb(page->mapping->host->i_sb);
-	if (btrfs_is_subpage(fs_info, page))
-		return btrfs_detach_subpage(fs_info, page);
+	if (btrfs_is_subpage(fs_info, page->mapping))
+		return btrfs_detach_subpage(fs_info, folio);
 
-	detach_page_private(page);
+	folio_detach_private(folio);
 }
 
 static struct extent_map *
@@ -1001,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 			memzero_page(page, zero_offset, iosize);
 		}
 	}
-	bio_ctrl->end_io_func = end_bio_extent_readpage;
+	bio_ctrl->end_io_func = end_bbio_data_read;
 	begin_page_read(fs_info, page);
 	while (cur <= end) {
 		enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
@@ -1027,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);
 
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-			compress_type = em->compress_type;
+		compress_type = extent_map_compression(em);
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		iosize = ALIGN(iosize, blocksize);
@@ -1037,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		else
 			disk_bytenr = em->block_start + extent_offset;
 		block_start = em->block_start;
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		if (em->flags & EXTENT_FLAG_PREALLOC)
 			block_start = EXTENT_MAP_HOLE;
 
 		/*
@@ -1074,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		 * is a corner case so we prioritize correctness over
 		 * non-optimal behavior (submitting 2 bios for the same extent).
 		 */
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+		if (compress_type != BTRFS_COMPRESS_NONE &&
 		    prev_em_start && *prev_em_start != (u64)-1 &&
 		    *prev_em_start != em->start)
 			force_bio_submit = true;
@@ -1240,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
 				 struct page *page, u64 *start, u64 *end)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct folio *folio = page_folio(page);
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 	struct btrfs_subpage_info *spi = fs_info->subpage_info;
 	u64 orig_start = *start;
 	/* Declare as unsigned long so we can use bitmap ops */
@@ -1252,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
 	 * For regular sector size == page size case, since one page only
 	 * contains one sector, we return the page offset directly.
 	 */
-	if (!btrfs_is_subpage(fs_info, page)) {
+	if (!btrfs_is_subpage(fs_info, page->mapping)) {
 		*start = page_offset(page);
 		*end = page_offset(page) + PAGE_SIZE;
 		return;
@@ -1305,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		return 1;
 	}
 
-	bio_ctrl->end_io_func = end_bio_extent_writepage;
+	bio_ctrl->end_io_func = end_bbio_data_write;
 	while (cur <= end) {
 		u32 len = end - cur + 1;
 		u64 disk_bytenr;
@@ -1325,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 			 * writeback the sectors with subpage dirty bits,
 			 * causing writeback without ordered extent.
 			 */
-			btrfs_page_clear_dirty(fs_info, page, cur, len);
+			btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
 			break;
 		}
 
@@ -1352,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		block_start = em->block_start;
 		disk_bytenr = em->block_start + extent_offset;
 
-		ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+		ASSERT(!extent_map_is_compressed(em));
 		ASSERT(block_start != EXTENT_MAP_HOLE);
 		ASSERT(block_start != EXTENT_MAP_INLINE);
 
@@ -1377,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		 * So clear subpage dirty bit here so next time we won't submit
 		 * page for range already written to disk.
 		 */
-		btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+		btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
 
 		submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
 				   cur - page_offset(page));
@@ -1385,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		nr++;
 	}
 
-	btrfs_page_assert_not_dirty(fs_info, page);
+	btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
 	*nr_ret = nr;
 	return 0;
 
@@ -1607,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
 	return NULL;
 }
 
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
 {
 	struct extent_buffer *eb = bbio->private;
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	bool uptodate = !bbio->bio.bi_status;
-	struct bvec_iter_all iter_all;
-	struct bio_vec *bvec;
+	struct folio_iter fi;
 	u32 bio_offset = 0;
 
 	if (!uptodate)
 		set_btree_ioerr(eb);
 
-	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+	bio_for_each_folio_all(fi, &bbio->bio) {
 		u64 start = eb->start + bio_offset;
-		struct page *page = bvec->bv_page;
-		u32 len = bvec->bv_len;
+		struct folio *folio = fi.folio;
+		u32 len = fi.length;
 
-		btrfs_page_clear_writeback(fs_info, page, start, len);
+		btrfs_folio_clear_writeback(fs_info, folio, start, len);
 		bio_offset += len;
 	}
 
@@ -1673,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 
 	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
 			       REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
-			       eb->fs_info, extent_buffer_write_end_io, eb);
+			       eb->fs_info, end_bbio_meta_write, eb);
 	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
 	bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
 	wbc_init_bio(wbc, &bbio->bio);
 	bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
 	bbio->file_offset = eb->start;
 	if (fs_info->nodesize < PAGE_SIZE) {
-		struct page *p = eb->pages[0];
+		struct folio *folio = eb->folios[0];
+		bool ret;
 
-		lock_page(p);
-		btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
-		if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+		folio_lock(folio);
+		btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+		if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
 						       eb->len)) {
-			clear_page_dirty_for_io(p);
+			folio_clear_dirty_for_io(folio);
 			wbc->nr_to_write--;
 		}
-		__bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
-		wbc_account_cgroup_owner(wbc, p, eb->len);
-		unlock_page(p);
+		ret = bio_add_folio(&bbio->bio, folio, eb->len,
+				    eb->start - folio_pos(folio));
+		ASSERT(ret);
+		wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+		folio_unlock(folio);
 	} else {
-		for (int i = 0; i < num_extent_pages(eb); i++) {
-			struct page *p = eb->pages[i];
-
-			lock_page(p);
-			clear_page_dirty_for_io(p);
-			set_page_writeback(p);
-			__bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
-			wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
-			wbc->nr_to_write--;
-			unlock_page(p);
+		int num_folios = num_extent_folios(eb);
+
+		for (int i = 0; i < num_folios; i++) {
+			struct folio *folio = eb->folios[i];
+			bool ret;
+
+			folio_lock(folio);
+			folio_clear_dirty_for_io(folio);
+			folio_start_writeback(folio);
+			ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+			ASSERT(ret);
+			wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+						 folio_size(folio));
+			wbc->nr_to_write -= folio_nr_pages(folio);
+			folio_unlock(folio);
 		}
 	}
 	btrfs_submit_bio(bbio, 0);
@@ -1725,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct folio *folio = page_folio(page);
 	int submitted = 0;
 	u64 page_start = page_offset(page);
 	int bit_start = 0;
@@ -1732,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
 
 	/* Lock and write each dirty extent buffers in the range */
 	while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
-		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+		struct btrfs_subpage *subpage = folio_get_private(folio);
 		struct extent_buffer *eb;
 		unsigned long flags;
 		u64 start;
@@ -1742,7 +1785,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
 		 * in the meantime.
 		 */
 		spin_lock(&page->mapping->i_private_lock);
-		if (!PagePrivate(page)) {
+		if (!folio_test_private(folio)) {
 			spin_unlock(&page->mapping->i_private_lock);
 			break;
 		}
@@ -1807,22 +1850,23 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
 {
 	struct writeback_control *wbc = ctx->wbc;
 	struct address_space *mapping = page->mapping;
+	struct folio *folio = page_folio(page);
 	struct extent_buffer *eb;
 	int ret;
 
-	if (!PagePrivate(page))
+	if (!folio_test_private(folio))
 		return 0;
 
 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
 		return submit_eb_subpage(page, wbc);
 
 	spin_lock(&mapping->i_private_lock);
-	if (!PagePrivate(page)) {
+	if (!folio_test_private(folio)) {
 		spin_unlock(&mapping->i_private_lock);
 		return 0;
 	}
 
-	eb = (struct extent_buffer *)page->private;
+	eb = folio_get_private(folio);
 
 	/*
 	 * Shouldn't happen and normally this would be a BUG_ON but no point
@@ -2201,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
 						       cur, cur_len, !ret);
 			mapping_set_error(page->mapping, ret);
 		}
-		btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+		btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
 		if (ret < 0)
 			found_error = true;
 next_page:
@@ -2352,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 				write_unlock(&map->lock);
 				break;
 			}
-			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+			if ((em->flags & EXTENT_FLAG_PINNED) ||
 			    em->start != start) {
 				write_unlock(&map->lock);
 				free_extent_map(em);
@@ -2369,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 			 * extra reference on the em.
 			 */
 			if (list_empty(&em->list) ||
-			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+			    (em->flags & EXTENT_FLAG_LOGGING))
 				goto remove_em;
 			/*
 			 * If it's in the list of modified extents, remove it
@@ -3058,14 +3102,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 }
 
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
 {
 	struct btrfs_subpage *subpage;
 
-	lockdep_assert_held(&page->mapping->i_private_lock);
+	lockdep_assert_held(&folio->mapping->i_private_lock);
 
-	if (PagePrivate(page)) {
-		subpage = (struct btrfs_subpage *)page->private;
+	if (folio_test_private(folio)) {
+		subpage = folio_get_private(folio);
 		if (atomic_read(&subpage->eb_refs))
 			return true;
 		/*
@@ -3078,21 +3122,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
 	return false;
 }
 
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
 	/*
-	 * For mapped eb, we're going to change the page private, which should
+	 * For mapped eb, we're going to change the folio private, which should
 	 * be done under the i_private_lock.
 	 */
 	if (mapped)
-		spin_lock(&page->mapping->i_private_lock);
+		spin_lock(&folio->mapping->i_private_lock);
 
-	if (!PagePrivate(page)) {
+	if (!folio_test_private(folio)) {
 		if (mapped)
-			spin_unlock(&page->mapping->i_private_lock);
+			spin_unlock(&folio->mapping->i_private_lock);
 		return;
 	}
 
@@ -3101,66 +3145,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
 		 * We do this since we'll remove the pages after we've
 		 * removed the eb from the radix tree, so we could race
 		 * and have this page now attached to the new eb.  So
-		 * only clear page_private if it's still connected to
+		 * only clear folio if it's still connected to
 		 * this eb.
 		 */
-		if (PagePrivate(page) &&
-		    page->private == (unsigned long)eb) {
+		if (folio_test_private(folio) && folio_get_private(folio) == eb) {
 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-			BUG_ON(PageDirty(page));
-			BUG_ON(PageWriteback(page));
-			/*
-			 * We need to make sure we haven't be attached
-			 * to a new eb.
-			 */
-			detach_page_private(page);
+			BUG_ON(folio_test_dirty(folio));
+			BUG_ON(folio_test_writeback(folio));
+			/* We need to make sure we haven't be attached to a new eb. */
+			folio_detach_private(folio);
 		}
 		if (mapped)
-			spin_unlock(&page->mapping->i_private_lock);
+			spin_unlock(&folio->mapping->i_private_lock);
 		return;
 	}
 
 	/*
-	 * For subpage, we can have dummy eb with page private.  In this case,
-	 * we can directly detach the private as such page is only attached to
-	 * one dummy eb, no sharing.
+	 * For subpage, we can have dummy eb with folio private attached.  In
+	 * this case, we can directly detach the private as such folio is only
+	 * attached to one dummy eb, no sharing.
 	 */
 	if (!mapped) {
-		btrfs_detach_subpage(fs_info, page);
+		btrfs_detach_subpage(fs_info, folio);
 		return;
 	}
 
-	btrfs_page_dec_eb_refs(fs_info, page);
+	btrfs_folio_dec_eb_refs(fs_info, folio);
 
 	/*
-	 * We can only detach the page private if there are no other ebs in the
+	 * We can only detach the folio private if there are no other ebs in the
 	 * page range and no unfinished IO.
 	 */
-	if (!page_range_has_eb(fs_info, page))
-		btrfs_detach_subpage(fs_info, page);
+	if (!folio_range_has_eb(fs_info, folio))
+		btrfs_detach_subpage(fs_info, folio);
 
-	spin_unlock(&page->mapping->i_private_lock);
+	spin_unlock(&folio->mapping->i_private_lock);
 }
 
 /* Release all pages attached to the extent buffer */
 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
 {
-	int i;
-	int num_pages;
-
 	ASSERT(!extent_buffer_under_io(eb));
 
-	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = eb->pages[i];
+	for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+		struct folio *folio = eb->folios[i];
 
-		if (!page)
+		if (!folio)
 			continue;
 
-		detach_extent_buffer_page(eb, page);
+		detach_extent_buffer_folio(eb, folio);
 
-		/* One for when we allocated the page */
-		put_page(page);
+		/* One for when we allocated the folio. */
+		folio_put(folio);
 	}
 }
 
@@ -3198,9 +3234,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 
 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 {
-	int i;
 	struct extent_buffer *new;
-	int num_pages = num_extent_pages(src);
+	int num_folios = num_extent_folios(src);
 	int ret;
 
 	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3214,22 +3249,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 	 */
 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
 
-	ret = btrfs_alloc_page_array(num_pages, new->pages);
+	ret = alloc_eb_folio_array(new, 0);
 	if (ret) {
 		btrfs_release_extent_buffer(new);
 		return NULL;
 	}
 
-	for (i = 0; i < num_pages; i++) {
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = new->folios[i];
 		int ret;
-		struct page *p = new->pages[i];
 
-		ret = attach_extent_buffer_page(new, p, NULL);
+		ret = attach_extent_buffer_folio(new, folio, NULL);
 		if (ret < 0) {
 			btrfs_release_extent_buffer(new);
 			return NULL;
 		}
-		WARN_ON(PageDirty(p));
+		WARN_ON(folio_test_dirty(folio));
 	}
 	copy_extent_buffer_full(new, src);
 	set_extent_buffer_uptodate(new);
@@ -3241,23 +3276,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 						  u64 start, unsigned long len)
 {
 	struct extent_buffer *eb;
-	int num_pages;
-	int i;
+	int num_folios = 0;
 	int ret;
 
 	eb = __alloc_extent_buffer(fs_info, start, len);
 	if (!eb)
 		return NULL;
 
-	num_pages = num_extent_pages(eb);
-	ret = btrfs_alloc_page_array(num_pages, eb->pages);
+	ret = alloc_eb_folio_array(eb, 0);
 	if (ret)
 		goto err;
 
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-
-		ret = attach_extent_buffer_page(eb, p, NULL);
+	num_folios = num_extent_folios(eb);
+	for (int i = 0; i < num_folios; i++) {
+		ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
 		if (ret < 0)
 			goto err;
 	}
@@ -3268,10 +3300,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	return eb;
 err:
-	for (i = 0; i < num_pages; i++) {
-		if (eb->pages[i]) {
-			detach_extent_buffer_page(eb, eb->pages[i]);
-			__free_page(eb->pages[i]);
+	for (int i = 0; i < num_folios; i++) {
+		if (eb->folios[i]) {
+			detach_extent_buffer_folio(eb, eb->folios[i]);
+			__folio_put(eb->folios[i]);
 		}
 	}
 	__free_extent_buffer(eb);
@@ -3320,20 +3352,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	spin_unlock(&eb->refs_lock);
 }
 
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
-		struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 {
-	int num_pages, i;
+	int num_folios= num_extent_folios(eb);
 
 	check_buffer_tree_ref(eb);
 
-	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-
-		if (p != accessed)
-			mark_page_accessed(p);
-	}
+	for (int i = 0; i < num_folios; i++)
+		folio_mark_accessed(eb->folios[i]);
 }
 
 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3361,7 +3387,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 		spin_lock(&eb->refs_lock);
 		spin_unlock(&eb->refs_lock);
 	}
-	mark_extent_buffer_accessed(eb, NULL);
+	mark_extent_buffer_accessed(eb);
 	return eb;
 }
 
@@ -3410,6 +3436,7 @@ free_eb:
 static struct extent_buffer *grab_extent_buffer(
 		struct btrfs_fs_info *fs_info, struct page *page)
 {
+	struct folio *folio = page_folio(page);
 	struct extent_buffer *exists;
 
 	/*
@@ -3421,21 +3448,21 @@ static struct extent_buffer *grab_extent_buffer(
 		return NULL;
 
 	/* Page not yet attached to an extent buffer */
-	if (!PagePrivate(page))
+	if (!folio_test_private(folio))
 		return NULL;
 
 	/*
 	 * We could have already allocated an eb for this page and attached one
 	 * so lets see if we can get a ref on the existing eb, and if we can we
 	 * know it's good and we can just return that one, else we know we can
-	 * just overwrite page->private.
+	 * just overwrite folio private.
 	 */
-	exists = (struct extent_buffer *)page->private;
+	exists = folio_get_private(folio);
 	if (atomic_inc_not_zero(&exists->refs))
 		return exists;
 
 	WARN_ON(PageDirty(page));
-	detach_page_private(page);
+	folio_detach_private(folio);
 	return NULL;
 }
 
@@ -3469,19 +3496,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
 	return 0;
 }
 
+
+/*
+ * Return 0 if eb->folios[i] is attached to btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret would be updated.
+ * Return -EAGAIN if the filemap has an existing folio but with different size
+ * than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+				      struct extent_buffer **found_eb_ret)
+{
+
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	struct address_space *mapping = fs_info->btree_inode->i_mapping;
+	const unsigned long index = eb->start >> PAGE_SHIFT;
+	struct folio *existing_folio;
+	int ret;
+
+	ASSERT(found_eb_ret);
+
+	/* Caller should ensure the folio exists. */
+	ASSERT(eb->folios[i]);
+
+retry:
+	ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+				GFP_NOFS | __GFP_NOFAIL);
+	if (!ret)
+		return 0;
+
+	existing_folio = filemap_lock_folio(mapping, index + i);
+	/* The page cache only exists for a very short time, just retry. */
+	if (IS_ERR(existing_folio))
+		goto retry;
+
+	/* For now, we should only have single-page folios for btree inode. */
+	ASSERT(folio_nr_pages(existing_folio) == 1);
+
+	if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+		folio_unlock(existing_folio);
+		folio_put(existing_folio);
+		return -EAGAIN;
+	}
+
+	if (fs_info->nodesize < PAGE_SIZE) {
+		/*
+		 * We're going to reuse the existing page, can drop our page
+		 * and subpage structure now.
+		 */
+		__free_page(folio_page(eb->folios[i], 0));
+		eb->folios[i] = existing_folio;
+	} else {
+		struct extent_buffer *existing_eb;
+
+		existing_eb = grab_extent_buffer(fs_info,
+						 folio_page(existing_folio, 0));
+		if (existing_eb) {
+			/* The extent buffer still exists, we can use it directly. */
+			*found_eb_ret = existing_eb;
+			folio_unlock(existing_folio);
+			folio_put(existing_folio);
+			return 1;
+		}
+		/* The extent buffer no longer exists, we can reuse the folio. */
+		__free_page(folio_page(eb->folios[i], 0));
+		eb->folios[i] = existing_folio;
+	}
+	return 0;
+}
+
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start, u64 owner_root, int level)
 {
 	unsigned long len = fs_info->nodesize;
-	int num_pages;
-	int i;
-	unsigned long index = start >> PAGE_SHIFT;
+	int num_folios;
+	int attached = 0;
 	struct extent_buffer *eb;
-	struct extent_buffer *exists = NULL;
-	struct page *p;
+	struct extent_buffer *existing_eb = NULL;
 	struct address_space *mapping = fs_info->btree_inode->i_mapping;
 	struct btrfs_subpage *prealloc = NULL;
 	u64 lockdep_owner = owner_root;
+	bool page_contig = true;
 	int uptodate = 1;
 	int ret;
 
@@ -3516,10 +3612,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
 
-	num_pages = num_extent_pages(eb);
-
 	/*
-	 * Preallocate page->private for subpage case, so that we won't
+	 * Preallocate folio private for subpage case, so that we won't
 	 * allocate memory with i_private_lock nor page lock hold.
 	 *
 	 * The memory will be freed by attach_extent_buffer_page() or freed
@@ -3528,47 +3622,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	if (fs_info->nodesize < PAGE_SIZE) {
 		prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
 		if (IS_ERR(prealloc)) {
-			exists = ERR_CAST(prealloc);
-			goto free_eb;
+			ret = PTR_ERR(prealloc);
+			goto out;
 		}
 	}
 
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
-		if (!p) {
-			exists = ERR_PTR(-ENOMEM);
-			btrfs_free_subpage(prealloc);
-			goto free_eb;
+reallocate:
+	/* Allocate all pages first. */
+	ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+	if (ret < 0) {
+		btrfs_free_subpage(prealloc);
+		goto out;
+	}
+
+	num_folios = num_extent_folios(eb);
+	/* Attach all pages to the filemap. */
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio;
+
+		ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+		if (ret > 0) {
+			ASSERT(existing_eb);
+			goto out;
 		}
 
-		spin_lock(&mapping->i_private_lock);
-		exists = grab_extent_buffer(fs_info, p);
-		if (exists) {
-			spin_unlock(&mapping->i_private_lock);
-			unlock_page(p);
-			put_page(p);
-			mark_extent_buffer_accessed(exists, p);
-			btrfs_free_subpage(prealloc);
-			goto free_eb;
+		/*
+		 * TODO: Special handling for a corner case where the order of
+		 * folios mismatch between the new eb and filemap.
+		 *
+		 * This happens when:
+		 *
+		 * - the new eb is using higher order folio
+		 *
+		 * - the filemap is still using 0-order folios for the range
+		 *   This can happen at the previous eb allocation, and we don't
+		 *   have higher order folio for the call.
+		 *
+		 * - the existing eb has already been freed
+		 *
+		 * In this case, we have to free the existing folios first, and
+		 * re-allocate using the same order.
+		 * Thankfully this is not going to happen yet, as we're still
+		 * using 0-order folios.
+		 */
+		if (unlikely(ret == -EAGAIN)) {
+			ASSERT(0);
+			goto reallocate;
 		}
+		attached++;
+
+		/*
+		 * Only after attach_eb_folio_to_filemap(), eb->folios[] is
+		 * reliable, as we may choose to reuse the existing page cache
+		 * and free the allocated page.
+		 */
+		folio = eb->folios[i];
+		spin_lock(&mapping->i_private_lock);
 		/* Should not fail, as we have preallocated the memory */
-		ret = attach_extent_buffer_page(eb, p, prealloc);
+		ret = attach_extent_buffer_folio(eb, folio, prealloc);
 		ASSERT(!ret);
 		/*
 		 * To inform we have extra eb under allocation, so that
-		 * detach_extent_buffer_page() won't release the page private
+		 * detach_extent_buffer_page() won't release the folio private
 		 * when the eb hasn't yet been inserted into radix tree.
 		 *
 		 * The ref will be decreased when the eb released the page, in
 		 * detach_extent_buffer_page().
 		 * Thus needs no special handling in error path.
 		 */
-		btrfs_page_inc_eb_refs(fs_info, p);
+		btrfs_folio_inc_eb_refs(fs_info, folio);
 		spin_unlock(&mapping->i_private_lock);
 
-		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
-		eb->pages[i] = p;
-		if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+		WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+
+		/*
+		 * Check if the current page is physically contiguous with previous eb
+		 * page.
+		 * At this stage, either we allocated a large folio, thus @i
+		 * would only be 0, or we fall back to per-page allocation.
+		 */
+		if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+			page_contig = false;
+
+		if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
 			uptodate = 0;
 
 		/*
@@ -3581,12 +3717,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+	/* All pages are physically contiguous, can skip cross page handling. */
+	if (page_contig)
+		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
 	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
-		exists = ERR_PTR(ret);
-		goto free_eb;
-	}
+	if (ret)
+		goto out;
 
 	spin_lock(&fs_info->buffer_lock);
 	ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3594,9 +3731,10 @@ again:
 	spin_unlock(&fs_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
-		if (exists)
-			goto free_eb;
+		ret = 0;
+		existing_eb = find_extent_buffer(fs_info, start);
+		if (existing_eb)
+			goto out;
 		else
 			goto again;
 	}
@@ -3609,19 +3747,46 @@ again:
 	 * btree_release_folio will correctly detect that a page belongs to a
 	 * live buffer and won't free them prematurely.
 	 */
-	for (i = 0; i < num_pages; i++)
-		unlock_page(eb->pages[i]);
+	for (int i = 0; i < num_folios; i++)
+		unlock_page(folio_page(eb->folios[i], 0));
 	return eb;
 
-free_eb:
+out:
 	WARN_ON(!atomic_dec_and_test(&eb->refs));
-	for (i = 0; i < num_pages; i++) {
-		if (eb->pages[i])
-			unlock_page(eb->pages[i]);
+
+	/*
+	 * Any attached folios need to be detached before we unlock them.  This
+	 * is because when we're inserting our new folios into the mapping, and
+	 * then attaching our eb to that folio.  If we fail to insert our folio
+	 * we'll lookup the folio for that index, and grab that EB.  We do not
+	 * want that to grab this eb, as we're getting ready to free it.  So we
+	 * have to detach it first and then unlock it.
+	 *
+	 * We have to drop our reference and NULL it out here because in the
+	 * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+	 * Below when we call btrfs_release_extent_buffer() we will call
+	 * detach_extent_buffer_folio() on our remaining pages in the !subpage
+	 * case.  If we left eb->folios[i] populated in the subpage case we'd
+	 * double put our reference and be super sad.
+	 */
+	for (int i = 0; i < attached; i++) {
+		ASSERT(eb->folios[i]);
+		detach_extent_buffer_folio(eb, eb->folios[i]);
+		unlock_page(folio_page(eb->folios[i], 0));
+		folio_put(eb->folios[i]);
+		eb->folios[i] = NULL;
 	}
+	/*
+	 * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
+	 * so it can be cleaned up without utlizing page->mapping.
+	 */
+	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
 	btrfs_release_extent_buffer(eb);
-	return exists;
+	if (ret < 0)
+		return ERR_PTR(ret);
+	ASSERT(existing_eb);
+	return existing_eb;
 }
 
 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3713,31 +3878,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	release_extent_buffer(eb);
 }
 
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
 {
-	ASSERT(PageDirty(page));
-	ASSERT(PageLocked(page));
-	clear_page_dirty_for_io(page);
-	xa_lock_irq(&page->mapping->i_pages);
-	if (!PageDirty(page))
-		__xa_clear_mark(&page->mapping->i_pages,
-				page_index(page), PAGECACHE_TAG_DIRTY);
-	xa_unlock_irq(&page->mapping->i_pages);
+	ASSERT(folio_test_dirty(folio));
+	ASSERT(folio_test_locked(folio));
+	folio_clear_dirty_for_io(folio);
+	xa_lock_irq(&folio->mapping->i_pages);
+	if (!folio_test_dirty(folio))
+		__xa_clear_mark(&folio->mapping->i_pages,
+				folio_index(folio), PAGECACHE_TAG_DIRTY);
+	xa_unlock_irq(&folio->mapping->i_pages);
 }
 
 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	struct page *page = eb->pages[0];
+	struct folio *folio = eb->folios[0];
 	bool last;
 
-	/* btree_clear_page_dirty() needs page locked */
-	lock_page(page);
-	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
-						  eb->len);
+	/* btree_clear_folio_dirty() needs page locked. */
+	folio_lock(folio);
+	last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
 	if (last)
-		btree_clear_page_dirty(page);
-	unlock_page(page);
+		btree_clear_folio_dirty(folio);
+	folio_unlock(folio);
 	WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
@@ -3745,15 +3909,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *eb)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	int i;
-	int num_pages;
-	struct page *page;
+	int num_folios;
 
 	btrfs_assert_tree_write_locked(eb);
 
 	if (trans && btrfs_header_generation(eb) != trans->transid)
 		return;
 
+	/*
+	 * Instead of clearing the dirty flag off of the buffer, mark it as
+	 * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+	 * write-ordering in zoned mode, without the need to later re-dirty
+	 * the extent_buffer.
+	 *
+	 * The actual zeroout of the buffer will happen later in
+	 * btree_csum_one_bio.
+	 */
+	if (btrfs_is_zoned(fs_info)) {
+		set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+		return;
+	}
+
 	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
 		return;
 
@@ -3763,30 +3939,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 	if (eb->fs_info->nodesize < PAGE_SIZE)
 		return clear_subpage_extent_buffer_dirty(eb);
 
-	num_pages = num_extent_pages(eb);
+	num_folios = num_extent_folios(eb);
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = eb->folios[i];
 
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (!PageDirty(page))
+		if (!folio_test_dirty(folio))
 			continue;
-		lock_page(page);
-		btree_clear_page_dirty(page);
-		unlock_page(page);
+		folio_lock(folio);
+		btree_clear_folio_dirty(folio);
+		folio_unlock(folio);
 	}
 	WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
 void set_extent_buffer_dirty(struct extent_buffer *eb)
 {
-	int i;
-	int num_pages;
+	int num_folios;
 	bool was_dirty;
 
 	check_buffer_tree_ref(eb);
 
 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 
-	num_pages = num_extent_pages(eb);
+	num_folios = num_extent_folios(eb);
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
 
@@ -3805,34 +3980,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
 		 * the above race.
 		 */
 		if (subpage)
-			lock_page(eb->pages[0]);
-		for (i = 0; i < num_pages; i++)
-			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
-					     eb->start, eb->len);
+			lock_page(folio_page(eb->folios[0], 0));
+		for (int i = 0; i < num_folios; i++)
+			btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+					      eb->start, eb->len);
 		if (subpage)
-			unlock_page(eb->pages[0]);
+			unlock_page(folio_page(eb->folios[0], 0));
 		percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
 					 eb->len,
 					 eb->fs_info->dirty_metadata_batch);
 	}
 #ifdef CONFIG_BTRFS_DEBUG
-	for (i = 0; i < num_pages; i++)
-		ASSERT(PageDirty(eb->pages[i]));
+	for (int i = 0; i < num_folios; i++)
+		ASSERT(folio_test_dirty(eb->folios[i]));
 #endif
 }
 
 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	struct page *page;
-	int num_pages;
-	int i;
+	int num_folios = num_extent_folios(eb);
 
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-		if (!page)
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = eb->folios[i];
+
+		if (!folio)
 			continue;
 
 		/*
@@ -3840,44 +4013,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
 		 */
 		if (fs_info->nodesize >= PAGE_SIZE)
-			ClearPageUptodate(page);
+			folio_clear_uptodate(folio);
 		else
-			btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
-						     eb->len);
+			btrfs_subpage_clear_uptodate(fs_info, folio,
+						     eb->start, eb->len);
 	}
 }
 
 void set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	struct page *page;
-	int num_pages;
-	int i;
+	int num_folios = num_extent_folios(eb);
 
 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = eb->folios[i];
 
 		/*
 		 * This is special handling for metadata subpage, as regular
 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
 		 */
 		if (fs_info->nodesize >= PAGE_SIZE)
-			SetPageUptodate(page);
+			folio_mark_uptodate(folio);
 		else
-			btrfs_subpage_set_uptodate(fs_info, page, eb->start,
-						   eb->len);
+			btrfs_subpage_set_uptodate(fs_info, folio,
+						   eb->start, eb->len);
 	}
 }
 
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
 {
 	struct extent_buffer *eb = bbio->private;
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	bool uptodate = !bbio->bio.bi_status;
-	struct bvec_iter_all iter_all;
-	struct bio_vec *bvec;
+	struct folio_iter fi;
 	u32 bio_offset = 0;
 
 	eb->read_mirror = bbio->mirror_num;
@@ -3893,15 +4062,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
 		set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	}
 
-	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+	bio_for_each_folio_all(fi, &bbio->bio) {
+		struct folio *folio = fi.folio;
 		u64 start = eb->start + bio_offset;
-		struct page *page = bvec->bv_page;
-		u32 len = bvec->bv_len;
+		u32 len = fi.length;
 
 		if (uptodate)
-			btrfs_page_set_uptodate(fs_info, page, start, len);
+			btrfs_folio_set_uptodate(fs_info, folio, start, len);
 		else
-			btrfs_page_clear_uptodate(fs_info, page, start, len);
+			btrfs_folio_clear_uptodate(fs_info, folio, start, len);
 
 		bio_offset += len;
 	}
@@ -3917,8 +4086,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 			     struct btrfs_tree_parent_check *check)
 {
-	int num_pages = num_extent_pages(eb), i;
 	struct btrfs_bio *bbio;
+	bool ret;
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
@@ -3942,17 +4111,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 
 	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
 			       REQ_OP_READ | REQ_META, eb->fs_info,
-			       extent_buffer_read_end_io, eb);
+			       end_bbio_meta_read, eb);
 	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
 	bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
 	bbio->file_offset = eb->start;
 	memcpy(&bbio->parent_check, check, sizeof(*check));
 	if (eb->fs_info->nodesize < PAGE_SIZE) {
-		__bio_add_page(&bbio->bio, eb->pages[0], eb->len,
-			       eb->start - page_offset(eb->pages[0]));
+		ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+				    eb->start - folio_pos(eb->folios[0]));
+		ASSERT(ret);
 	} else {
-		for (i = 0; i < num_pages; i++)
-			__bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+		int num_folios = num_extent_folios(eb);
+
+		for (int i = 0; i < num_folios; i++) {
+			struct folio *folio = eb->folios[i];
+
+			ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+			ASSERT(ret);
+		}
 	}
 	btrfs_submit_bio(bbio, mirror_num);
 
@@ -3999,29 +4175,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 			unsigned long start, unsigned long len)
 {
+	const int unit_size = folio_size(eb->folios[0]);
 	size_t cur;
 	size_t offset;
-	struct page *page;
-	char *kaddr;
 	char *dst = (char *)dstv;
-	unsigned long i = get_eb_page_index(start);
+	unsigned long i = get_eb_folio_index(eb, start);
 
 	if (check_eb_range(eb, start, len)) {
 		/*
 		 * Invalid range hit, reset the memory, so callers won't get
-		 * some random garbage for their uninitialzed memory.
+		 * some random garbage for their uninitialized memory.
 		 */
 		memset(dstv, 0, len);
 		return;
 	}
 
-	offset = get_eb_offset_in_page(eb, start);
+	if (eb->addr) {
+		memcpy(dstv, eb->addr + start, len);
+		return;
+	}
+
+	offset = get_eb_offset_in_folio(eb, start);
 
 	while (len > 0) {
-		page = eb->pages[i];
+		char *kaddr;
 
-		cur = min(len, (PAGE_SIZE - offset));
-		kaddr = page_address(page);
+		cur = min(len, unit_size - offset);
+		kaddr = folio_address(eb->folios[i]);
 		memcpy(dst, kaddr + offset, cur);
 
 		dst += cur;
@@ -4035,24 +4215,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 				       void __user *dstv,
 				       unsigned long start, unsigned long len)
 {
+	const int unit_size = folio_size(eb->folios[0]);
 	size_t cur;
 	size_t offset;
-	struct page *page;
-	char *kaddr;
 	char __user *dst = (char __user *)dstv;
-	unsigned long i = get_eb_page_index(start);
+	unsigned long i = get_eb_folio_index(eb, start);
 	int ret = 0;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = get_eb_offset_in_page(eb, start);
+	if (eb->addr) {
+		if (copy_to_user_nofault(dstv, eb->addr + start, len))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	offset = get_eb_offset_in_folio(eb, start);
 
 	while (len > 0) {
-		page = eb->pages[i];
+		char *kaddr;
 
-		cur = min(len, (PAGE_SIZE - offset));
-		kaddr = page_address(page);
+		cur = min(len, unit_size - offset);
+		kaddr = folio_address(eb->folios[i]);
 		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
 			ret = -EFAULT;
 			break;
@@ -4070,25 +4255,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start, unsigned long len)
 {
+	const int unit_size = folio_size(eb->folios[0]);
 	size_t cur;
 	size_t offset;
-	struct page *page;
 	char *kaddr;
 	char *ptr = (char *)ptrv;
-	unsigned long i = get_eb_page_index(start);
+	unsigned long i = get_eb_folio_index(eb, start);
 	int ret = 0;
 
 	if (check_eb_range(eb, start, len))
 		return -EINVAL;
 
-	offset = get_eb_offset_in_page(eb, start);
+	if (eb->addr)
+		return memcmp(ptrv, eb->addr + start, len);
 
-	while (len > 0) {
-		page = eb->pages[i];
+	offset = get_eb_offset_in_folio(eb, start);
 
-		cur = min(len, (PAGE_SIZE - offset));
-
-		kaddr = page_address(page);
+	while (len > 0) {
+		cur = min(len, unit_size - offset);
+		kaddr = folio_address(eb->folios[i]);
 		ret = memcmp(ptr, kaddr + offset, cur);
 		if (ret)
 			break;
@@ -4107,10 +4292,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
  * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
  * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
  */
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
-				    struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
+	struct folio *folio = eb->folios[i];
+
+	ASSERT(folio);
 
 	/*
 	 * If we are using the commit root we could potentially clear a page
@@ -4124,11 +4311,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
 		return;
 
 	if (fs_info->nodesize < PAGE_SIZE) {
-		if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+		struct folio *folio = eb->folios[0];
+
+		ASSERT(i == 0);
+		if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
 							 eb->start, eb->len)))
-			btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+			btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
 	} else {
-		WARN_ON(!PageUptodate(page));
+		WARN_ON(!folio_test_uptodate(folio));
 	}
 }
 
@@ -4136,29 +4326,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
 				  const void *srcv, unsigned long start,
 				  unsigned long len, bool use_memmove)
 {
+	const int unit_size = folio_size(eb->folios[0]);
 	size_t cur;
 	size_t offset;
-	struct page *page;
 	char *kaddr;
 	char *src = (char *)srcv;
-	unsigned long i = get_eb_page_index(start);
+	unsigned long i = get_eb_folio_index(eb, start);
 	/* For unmapped (dummy) ebs, no need to check their uptodate status. */
 	const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
-	WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
 	if (check_eb_range(eb, start, len))
 		return;
 
-	offset = get_eb_offset_in_page(eb, start);
+	if (eb->addr) {
+		if (use_memmove)
+			memmove(eb->addr + start, srcv, len);
+		else
+			memcpy(eb->addr + start, srcv, len);
+		return;
+	}
+
+	offset = get_eb_offset_in_folio(eb, start);
 
 	while (len > 0) {
-		page = eb->pages[i];
 		if (check_uptodate)
-			assert_eb_page_uptodate(eb, page);
+			assert_eb_folio_uptodate(eb, i);
 
-		cur = min(len, PAGE_SIZE - offset);
-		kaddr = page_address(page);
+		cur = min(len, unit_size - offset);
+		kaddr = folio_address(eb->folios[i]);
 		if (use_memmove)
 			memmove(kaddr + offset, src, cur);
 		else
@@ -4180,16 +4375,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
 static void memset_extent_buffer(const struct extent_buffer *eb, int c,
 				 unsigned long start, unsigned long len)
 {
+	const int unit_size = folio_size(eb->folios[0]);
 	unsigned long cur = start;
 
+	if (eb->addr) {
+		memset(eb->addr + start, c, len);
+		return;
+	}
+
 	while (cur < start + len) {
-		unsigned long index = get_eb_page_index(cur);
-		unsigned int offset = get_eb_offset_in_page(eb, cur);
-		unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
-		struct page *page = eb->pages[index];
+		unsigned long index = get_eb_folio_index(eb, cur);
+		unsigned int offset = get_eb_offset_in_folio(eb, cur);
+		unsigned int cur_len = min(start + len - cur, unit_size - offset);
 
-		assert_eb_page_uptodate(eb, page);
-		memset(page_address(page) + offset, c, cur_len);
+		assert_eb_folio_uptodate(eb, index);
+		memset(folio_address(eb->folios[index]) + offset, c, cur_len);
 
 		cur += cur_len;
 	}
@@ -4206,15 +4406,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 void copy_extent_buffer_full(const struct extent_buffer *dst,
 			     const struct extent_buffer *src)
 {
+	const int unit_size = folio_size(src->folios[0]);
 	unsigned long cur = 0;
 
 	ASSERT(dst->len == src->len);
 
 	while (cur < src->len) {
-		unsigned long index = get_eb_page_index(cur);
-		unsigned long offset = get_eb_offset_in_page(src, cur);
-		unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
-		void *addr = page_address(src->pages[index]) + offset;
+		unsigned long index = get_eb_folio_index(src, cur);
+		unsigned long offset = get_eb_offset_in_folio(src, cur);
+		unsigned long cur_len = min(src->len, unit_size - offset);
+		void *addr = folio_address(src->folios[index]) + offset;
 
 		write_extent_buffer(dst, addr, cur, cur_len);
 
@@ -4227,12 +4428,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len)
 {
+	const int unit_size = folio_size(dst->folios[0]);
 	u64 dst_len = dst->len;
 	size_t cur;
 	size_t offset;
-	struct page *page;
 	char *kaddr;
-	unsigned long i = get_eb_page_index(dst_offset);
+	unsigned long i = get_eb_folio_index(dst, dst_offset);
 
 	if (check_eb_range(dst, dst_offset, len) ||
 	    check_eb_range(src, src_offset, len))
@@ -4240,15 +4441,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 
 	WARN_ON(src->len != dst_len);
 
-	offset = get_eb_offset_in_page(dst, dst_offset);
+	offset = get_eb_offset_in_folio(dst, dst_offset);
 
 	while (len > 0) {
-		page = dst->pages[i];
-		assert_eb_page_uptodate(dst, page);
+		assert_eb_folio_uptodate(dst, i);
 
-		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+		cur = min(len, (unsigned long)(unit_size - offset));
 
-		kaddr = page_address(page);
+		kaddr = folio_address(dst->folios[i]);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
 
 		src_offset += cur;
@@ -4259,22 +4459,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 }
 
 /*
- * Calculate the page and offset of the byte containing the given bit number.
+ * Calculate the folio and offset of the byte containing the given bit number.
  *
  * @eb:           the extent buffer
  * @start:        offset of the bitmap item in the extent buffer
  * @nr:           bit number
- * @page_index:   return index of the page in the extent buffer that contains
+ * @folio_index:  return index of the folio in the extent buffer that contains
  *                the given bit number
- * @page_offset:  return offset into the page given by page_index
+ * @folio_offset: return offset into the folio given by folio_index
  *
  * This helper hides the ugliness of finding the byte in an extent buffer which
  * contains a given bit.
  */
 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 				    unsigned long start, unsigned long nr,
-				    unsigned long *page_index,
-				    size_t *page_offset)
+				    unsigned long *folio_index,
+				    size_t *folio_offset)
 {
 	size_t byte_offset = BIT_BYTE(nr);
 	size_t offset;
@@ -4284,10 +4484,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 	 * the bitmap item in the extent buffer + the offset of the byte in the
 	 * bitmap item.
 	 */
-	offset = start + offset_in_page(eb->start) + byte_offset;
+	offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
 
-	*page_index = offset >> PAGE_SHIFT;
-	*page_offset = offset_in_page(offset);
+	*folio_index = offset >> folio_shift(eb->folios[0]);
+	*folio_offset = offset_in_folio(eb->folios[0], offset);
 }
 
 /*
@@ -4300,25 +4500,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long nr)
 {
-	u8 *kaddr;
-	struct page *page;
 	unsigned long i;
 	size_t offset;
+	u8 *kaddr;
 
 	eb_bitmap_offset(eb, start, nr, &i, &offset);
-	page = eb->pages[i];
-	assert_eb_page_uptodate(eb, page);
-	kaddr = page_address(page);
+	assert_eb_folio_uptodate(eb, i);
+	kaddr = folio_address(eb->folios[i]);
 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
 }
 
 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
 {
-	unsigned long index = get_eb_page_index(bytenr);
+	unsigned long index = get_eb_folio_index(eb, bytenr);
 
 	if (check_eb_range(eb, bytenr, 1))
 		return NULL;
-	return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+	return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
 }
 
 /*
@@ -4403,19 +4601,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
 			  unsigned long dst_offset, unsigned long src_offset,
 			  unsigned long len)
 {
+	const int unit_size = folio_size(dst->folios[0]);
 	unsigned long cur_off = 0;
 
 	if (check_eb_range(dst, dst_offset, len) ||
 	    check_eb_range(dst, src_offset, len))
 		return;
 
+	if (dst->addr) {
+		const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+		if (use_memmove)
+			memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+		else
+			memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+		return;
+	}
+
 	while (cur_off < len) {
 		unsigned long cur_src = cur_off + src_offset;
-		unsigned long pg_index = get_eb_page_index(cur_src);
-		unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+		unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+		unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
 		unsigned long cur_len = min(src_offset + len - cur_src,
-					    PAGE_SIZE - pg_off);
-		void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+					    unit_size - folio_off);
+		void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
 		const bool use_memmove = areas_overlap(src_offset + cur_off,
 						       dst_offset + cur_off, cur_len);
 
@@ -4441,24 +4650,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 		return;
 	}
 
+	if (dst->addr) {
+		memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+		return;
+	}
+
 	while (len > 0) {
 		unsigned long src_i;
 		size_t cur;
-		size_t dst_off_in_page;
-		size_t src_off_in_page;
+		size_t dst_off_in_folio;
+		size_t src_off_in_folio;
 		void *src_addr;
 		bool use_memmove;
 
-		src_i = get_eb_page_index(src_end);
+		src_i = get_eb_folio_index(dst, src_end);
 
-		dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
-		src_off_in_page = get_eb_offset_in_page(dst, src_end);
+		dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+		src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
 
-		cur = min_t(unsigned long, len, src_off_in_page + 1);
-		cur = min(cur, dst_off_in_page + 1);
+		cur = min_t(unsigned long, len, src_off_in_folio + 1);
+		cur = min(cur, dst_off_in_folio + 1);
 
-		src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
-					cur + 1;
+		src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+					 cur + 1;
 		use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
 					    cur);
 
@@ -4520,7 +4734,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
 		struct extent_buffer *eb = NULL;
 
 		/*
-		 * Unlike try_release_extent_buffer() which uses page->private
+		 * Unlike try_release_extent_buffer() which uses folio private
 		 * to grab buffer, for subpage case we rely on radix tree, thus
 		 * we need to ensure radix tree consistency.
 		 *
@@ -4560,17 +4774,17 @@ static int try_release_subpage_extent_buffer(struct page *page)
 
 		/*
 		 * Here we don't care about the return value, we will always
-		 * check the page private at the end.  And
+		 * check the folio private at the end.  And
 		 * release_extent_buffer() will release the refs_lock.
 		 */
 		release_extent_buffer(eb);
 	}
 	/*
-	 * Finally to check if we have cleared page private, as if we have
-	 * released all ebs in the page, the page private should be cleared now.
+	 * Finally to check if we have cleared folio private, as if we have
+	 * released all ebs in the page, the folio private should be cleared now.
 	 */
 	spin_lock(&page->mapping->i_private_lock);
-	if (!PagePrivate(page))
+	if (!folio_test_private(page_folio(page)))
 		ret = 1;
 	else
 		ret = 0;
@@ -4581,22 +4795,23 @@ static int try_release_subpage_extent_buffer(struct page *page)
 
 int try_release_extent_buffer(struct page *page)
 {
+	struct folio *folio = page_folio(page);
 	struct extent_buffer *eb;
 
 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
 		return try_release_subpage_extent_buffer(page);
 
 	/*
-	 * We need to make sure nobody is changing page->private, as we rely on
-	 * page->private as the pointer to extent buffer.
+	 * We need to make sure nobody is changing folio private, as we rely on
+	 * folio private as the pointer to extent buffer.
 	 */
 	spin_lock(&page->mapping->i_private_lock);
-	if (!PagePrivate(page)) {
+	if (!folio_test_private(folio)) {
 		spin_unlock(&page->mapping->i_private_lock);
 		return 1;
 	}
 
-	eb = (struct extent_buffer *)page->private;
+	eb = folio_get_private(folio);
 	BUG_ON(!eb);
 
 	/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..46050500529b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,8 @@ enum {
 	EXTENT_BUFFER_IN_TREE,
 	/* write IO error */
 	EXTENT_BUFFER_WRITE_ERR,
-	EXTENT_BUFFER_NO_CHECK,
+	/* Indicate the extent buffer is written zeroed out (for zoned) */
+	EXTENT_BUFFER_ZONED_ZEROOUT,
 	/* Indicate that extent buffer pages a being read */
 	EXTENT_BUFFER_READING,
 };
@@ -43,10 +44,10 @@ enum {
 };
 
 /*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values.  Every page that is controlled by the extent map has
+ * folio private set to this value.
  */
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE			1
 
 /*
  * The extent buffer bitmap operations are done with byte granularity instead of
@@ -77,6 +78,13 @@ struct extent_buffer {
 	unsigned long len;
 	unsigned long bflags;
 	struct btrfs_fs_info *fs_info;
+
+	/*
+	 * The address where the eb can be accessed without any cross-page handling.
+	 * This can be NULL if not possible.
+	 */
+	void *addr;
+
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
@@ -86,7 +94,12 @@ struct extent_buffer {
 
 	struct rw_semaphore lock;
 
-	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+	/*
+	 * Pointers to all the folios of the extent buffer.
+	 *
+	 * For now the folio is always order 0 (aka, a single page).
+	 */
+	struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
 	pid_t lock_owner;
@@ -108,29 +121,43 @@ struct btrfs_eb_write_context {
  *
  * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
  */
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
-					   unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+					    unsigned long offset)
 {
 	/*
-	 * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
-	 * to PAGE_SIZE, thus adding it won't cause any difference.
+	 * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+	 *    1.1) One large folio covering the whole eb
+	 *	   The eb->start is aligned to folio size, thus adding it
+	 *	   won't cause any difference.
+	 *    1.2) Several page sized folios
+	 *	   The eb->start is aligned to folio (page) size, thus
+	 *	   adding it won't cause any difference.
 	 *
-	 * For sectorsize < PAGE_SIZE, we must only read the data that belongs
-	 * to the eb, thus we have to take the eb->start into consideration.
+	 * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+	 *    In this case there would only be one page sized folio, and there
+	 *    may be several different extent buffers in the page/folio.
+	 *    We need to add eb->start to properly access the offset inside
+	 *    that eb.
 	 */
-	return offset_in_page(offset + eb->start);
+	return offset_in_folio(eb->folios[0], offset + eb->start);
 }
 
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+					       unsigned long offset)
 {
 	/*
-	 * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+	 * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+	 *    1.1) One large folio covering the whole eb.
+	 *	   the folio_shift would be large enough to always make us
+	 *	   return 0 as index.
+	 *    1.2) Several page sized folios
+	 *         The folio_shift() would be PAGE_SHIFT, giving us the correct
+	 *         index.
 	 *
-	 * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
-	 * and have ensured that all tree blocks are contained in one page,
-	 * thus we always get index == 0.
+	 * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+	 *    The folio would only be page sized, and always give us 0 as index.
 	 */
-	return offset >> PAGE_SHIFT;
+	return offset >> folio_shift(eb->folios[0]);
 }
 
 /*
@@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
 	return (eb->len >> PAGE_SHIFT) ?: 1;
 }
 
+/*
+ * This can only be determined at runtime by checking eb::folios[0].
+ *
+ * As we can have either one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+	if (folio_order(eb->folios[0]))
+		return 1;
+	return num_extent_pages(eb);
+}
+
 static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
 {
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *buf);
 
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+			   gfp_t extra_gfp);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a6d8368ed0ed..b61099bf97a8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void)
 	if (!em)
 		return NULL;
 	RB_CLEAR_NODE(&em->rb_node);
-	em->compress_type = BTRFS_COMPRESS_NONE;
 	refcount_set(&em->refs, 1);
 	INIT_LIST_HEAD(&em->list);
 	return em;
@@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em)
 	if (refcount_dec_and_test(&em->refs)) {
 		WARN_ON(extent_map_in_tree(em));
 		WARN_ON(!list_empty(&em->list));
-		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
-			kfree(em->map_lookup);
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_end(const struct extent_map *em)
 {
-	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
-		return 0;
+	if (em->block_start + em->block_len < em->block_start)
+		return (u64)-1;
+	return em->block_start + em->block_len;
+}
 
-	/*
-	 * don't merge compressed extents, we need to know their
-	 * actual size
-	 */
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
-		return 0;
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+	if (em->flags & EXTENT_FLAG_PINNED)
+		return false;
+
+	/* Don't merge compressed extents, we need to know their actual size. */
+	if (extent_map_is_compressed(em))
+		return false;
 
-	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
-	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
-		return 0;
+	if (em->flags & EXTENT_FLAG_LOGGING)
+		return false;
 
 	/*
 	 * We don't want to merge stuff that hasn't been written to the log yet
 	 * since it may not reflect exactly what is on disk, and that would be
 	 * bad.
 	 */
-	if (!list_empty(&prev->list) || !list_empty(&next->list))
-		return 0;
-
-	ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
-	       prev->block_start != EXTENT_MAP_DELALLOC);
-
-	if (prev->map_lookup || next->map_lookup)
-		ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
-		       test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
-	if (extent_map_end(prev) == next->start &&
-	    prev->flags == next->flags &&
-	    prev->map_lookup == next->map_lookup &&
-	    ((next->block_start == EXTENT_MAP_HOLE &&
-	      prev->block_start == EXTENT_MAP_HOLE) ||
-	     (next->block_start == EXTENT_MAP_INLINE &&
-	      prev->block_start == EXTENT_MAP_INLINE) ||
-	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
-	      next->block_start == extent_map_block_end(prev)))) {
-		return 1;
-	}
-	return 0;
+	if (!list_empty(&em->list))
+		return false;
+
+	return true;
+}
+
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+	if (extent_map_end(prev) != next->start)
+		return false;
+
+	if (prev->flags != next->flags)
+		return false;
+
+	if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
+		return next->block_start == extent_map_block_end(prev);
+
+	/* HOLES and INLINE extents. */
+	return next->block_start == prev->block_start;
 }
 
 static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
@@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 	if (refcount_read(&em->refs) > 2)
 		return;
 
+	if (!can_merge_extent_map(em))
+		return;
+
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
 		if (rb)
 			merge = rb_entry(rb, struct extent_map, rb_node);
-		if (rb && mergable_maps(merge, em)) {
+		if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
 			em->start = merge->start;
 			em->orig_start = merge->orig_start;
 			em->len += merge->len;
@@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
-			set_bit(EXTENT_FLAG_MERGED, &em->flags);
+			em->flags |= EXTENT_FLAG_MERGED;
 
 			rb_erase_cached(&merge->rb_node, &tree->map);
 			RB_CLEAR_NODE(&merge->rb_node);
@@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 	rb = rb_next(&em->rb_node);
 	if (rb)
 		merge = rb_entry(rb, struct extent_map, rb_node);
-	if (rb && mergable_maps(em, merge)) {
+	if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
 		em->len += merge->len;
 		em->block_len += merge->block_len;
 		rb_erase_cached(&merge->rb_node, &tree->map);
 		RB_CLEAR_NODE(&merge->rb_node);
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
-		set_bit(EXTENT_FLAG_MERGED, &em->flags);
+		em->flags |= EXTENT_FLAG_MERGED;
 		free_extent_map(merge);
 	}
 }
@@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 /*
  * Unpin an extent from the cache.
  *
- * @tree:	tree to unpin the extent in
+ * @inode:	the inode from which we are unpinning an extent range
  * @start:	logical offset in the file
  * @len:	length of the extent
  * @gen:	generation that this extent has been modified in
@@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
  * to the generation that actually added the file item to the inode so we know
  * we need to sync this extent when we call fsync().
  */
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
-		       u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map_tree *tree = &inode->extent_tree;
 	int ret = 0;
 	struct extent_map *em;
 	bool prealloc = false;
@@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
 
-	WARN_ON(!em || em->start != start);
-
-	if (!em)
+	if (WARN_ON(!em)) {
+		btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+			   btrfs_ino(inode), btrfs_root_id(inode->root),
+			   start, len, gen);
 		goto out;
+	}
+
+	if (WARN_ON(em->start != start))
+		btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+			   btrfs_ino(inode), btrfs_root_id(inode->root),
+			   em->start, start, len, gen);
 
 	em->generation = gen;
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->flags &= ~EXTENT_FLAG_PINNED;
 	em->mod_start = em->start;
 	em->mod_len = em->len;
 
-	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+	if (em->flags & EXTENT_FLAG_FILLING) {
 		prealloc = true;
-		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+		em->flags &= ~EXTENT_FLAG_FILLING;
 	}
 
 	try_merge_map(tree, em);
@@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
 {
 	lockdep_assert_held_write(&tree->lock);
 
-	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	em->flags &= ~EXTENT_FLAG_LOGGING;
 	if (extent_map_in_tree(em))
 		try_merge_map(tree, em);
 }
@@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
 	em->mod_start = em->start;
 	em->mod_len = em->len;
 
+	ASSERT(list_empty(&em->list));
+
 	if (modified)
-		list_move(&em->list, &tree->modified_extents);
+		list_add(&em->list, &tree->modified_extents);
 	else
 		try_merge_map(tree, em);
 }
 
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
-	struct map_lookup *map = em->map_lookup;
-	u64 stripe_size = em->orig_block_len;
-	int i;
-
-	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_io_stripe *stripe = &map->stripes[i];
-		struct btrfs_device *device = stripe->dev;
-
-		set_extent_bit(&device->alloc_state, stripe->physical,
-			       stripe->physical + stripe_size - 1,
-			       bits | EXTENT_NOWAIT, NULL);
-	}
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
-	struct map_lookup *map = em->map_lookup;
-	u64 stripe_size = em->orig_block_len;
-	int i;
-
-	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_io_stripe *stripe = &map->stripes[i];
-		struct btrfs_device *device = stripe->dev;
-
-		__clear_extent_bit(&device->alloc_state, stripe->physical,
-				   stripe->physical + stripe_size - 1,
-				   bits | EXTENT_NOWAIT,
-				   NULL, NULL);
-	}
-}
-
 /*
  * Add new extent map to the extent tree
  *
@@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
  * into the tree directly, with an additional reference taken, or a
  * reference dropped if the merge attempt was successful.
  */
-int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em, int modified)
+static int add_extent_mapping(struct extent_map_tree *tree,
+			      struct extent_map *em, int modified)
 {
 	int ret = 0;
 
@@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 
 	setup_extent_mapping(tree, em, modified);
-	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
-		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
-		extent_map_device_clear_bits(em, CHUNK_TRIMMED);
-	}
 out:
 	return ret;
 }
@@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
 	lockdep_assert_held_write(&tree->lock);
 
-	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+	WARN_ON(em->flags & EXTENT_FLAG_PINNED);
 	rb_erase_cached(&em->rb_node, &tree->map);
-	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+	if (!(em->flags & EXTENT_FLAG_LOGGING))
 		list_del_init(&em->list);
-	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
-		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
 	RB_CLEAR_NODE(&em->rb_node);
 }
 
@@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
 {
 	lockdep_assert_held_write(&tree->lock);
 
-	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+	WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
 	ASSERT(extent_map_in_tree(cur));
-	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+	if (!(cur->flags & EXTENT_FLAG_LOGGING))
 		list_del_init(&cur->list);
 	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
 	RB_CLEAR_NODE(&cur->rb_node);
@@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
 	em->start = start;
 	em->len = end - start;
 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
-	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+	    !extent_map_is_compressed(em)) {
 		em->block_start += start_diff;
 		em->block_len = em->len;
 	}
@@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
 
-		ret = 0;
-
 		existing = search_extent_mapping(em_tree, start, len);
 
 		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
 
 		node = rb_first_cached(&tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
-		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
 		remove_extent_mapping(tree, em);
 		free_extent_map(em);
 		cond_resched_rwlock_write(&tree->lock);
@@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 			}
 		}
 
-		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+		if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
 			start = em_end;
 			goto next;
 		}
 
 		flags = em->flags;
-		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		/*
 		 * In case we split the extent map, we want to preserve the
 		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
 		 * it on the new extent maps.
 		 */
-		clear_bit(EXTENT_FLAG_LOGGING, &flags);
+		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
 		modified = !list_empty(&em->list);
 
 		/*
@@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 			goto remove_em;
 
 		gen = em->generation;
-		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		compressed = extent_map_is_compressed(em);
 
 		if (em->start < start) {
 			if (!split) {
@@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 
 			split->generation = gen;
 			split->flags = flags;
-			split->compress_type = em->compress_type;
 			replace_extent_mapping(em_tree, em, split, modified);
 			free_extent_map(split);
 			split = split2;
@@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
 			split->len = em_end - end;
 			split->block_start = em->block_start;
 			split->flags = flags;
-			split->compress_type = em->compress_type;
 			split->generation = gen;
 
 			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	}
 
 	ASSERT(em->len == len);
-	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+	ASSERT(!extent_map_is_compressed(em));
 	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
-	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+	ASSERT(em->flags & EXTENT_FLAG_PINNED);
+	ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
 	ASSERT(!list_empty(&em->list));
 
 	flags = em->flags;
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->flags &= ~EXTENT_FLAG_PINNED;
 
 	/* First, replace the em with a new extent_map starting from * em->start */
 	split_pre->start = em->start;
@@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	split_pre->orig_block_len = split_pre->block_len;
 	split_pre->ram_bytes = split_pre->len;
 	split_pre->flags = flags;
-	split_pre->compress_type = em->compress_type;
 	split_pre->generation = em->generation;
 
 	replace_extent_mapping(em_tree, em, split_pre, 1);
@@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	split_mid->orig_block_len = split_mid->block_len;
 	split_mid->ram_bytes = split_mid->len;
 	split_mid->flags = flags;
-	split_mid->compress_type = em->compress_type;
 	split_mid->generation = em->generation;
 	add_extent_mapping(em_tree, split_mid, 1);
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..e380fc08bbe4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -5,30 +5,33 @@
 
 #include <linux/rbtree.h>
 #include <linux/refcount.h>
+#include "compression.h"
 
 #define EXTENT_MAP_LAST_BYTE ((u64)-4)
 #define EXTENT_MAP_HOLE ((u64)-3)
 #define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
 
 /* bits for the extent_map::flags field */
 enum {
 	/* this entry not yet on disk, don't free it */
-	EXTENT_FLAG_PINNED,
-	EXTENT_FLAG_COMPRESSED,
+	ENUM_BIT(EXTENT_FLAG_PINNED),
+	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+	ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
 	/* pre-allocated extent */
-	EXTENT_FLAG_PREALLOC,
+	ENUM_BIT(EXTENT_FLAG_PREALLOC),
 	/* Logging this extent */
-	EXTENT_FLAG_LOGGING,
+	ENUM_BIT(EXTENT_FLAG_LOGGING),
 	/* Filling in a preallocated extent */
-	EXTENT_FLAG_FILLING,
-	/* filesystem extent mapping type */
-	EXTENT_FLAG_FS_MAPPING,
+	ENUM_BIT(EXTENT_FLAG_FILLING),
 	/* This em is merged from two or more physically adjacent ems */
-	EXTENT_FLAG_MERGED,
+	ENUM_BIT(EXTENT_FLAG_MERGED),
 };
 
+/*
+ * Keep this structure as compact as possible, as we can have really large
+ * amounts of allocated extent maps at any time.
+ */
 struct extent_map {
 	struct rb_node rb_node;
 
@@ -49,11 +52,8 @@ struct extent_map {
 	 * For non-merged extents, it's from btrfs_file_extent_item::generation.
 	 */
 	u64 generation;
-	unsigned long flags;
-	/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
-	struct map_lookup *map_lookup;
+	u32 flags;
 	refcount_t refs;
-	unsigned int compress_type;
 	struct list_head list;
 };
 
@@ -65,30 +65,57 @@ struct extent_map_tree {
 
 struct btrfs_inode;
 
+static inline void extent_map_set_compression(struct extent_map *em,
+					      enum btrfs_compression_type type)
+{
+	if (type == BTRFS_COMPRESS_ZLIB)
+		em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+	else if (type == BTRFS_COMPRESS_LZO)
+		em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+	else if (type == BTRFS_COMPRESS_ZSTD)
+		em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+	if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+		return BTRFS_COMPRESS_ZLIB;
+
+	if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+		return BTRFS_COMPRESS_LZO;
+
+	if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+		return BTRFS_COMPRESS_ZSTD;
+
+	return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+	return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+			     EXTENT_FLAG_COMPRESS_LZO |
+			     EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
 static inline int extent_map_in_tree(const struct extent_map *em)
 {
 	return !RB_EMPTY_NODE(&em->rb_node);
 }
 
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
 {
 	if (em->start + em->len < em->start)
 		return (u64)-1;
 	return em->start + em->len;
 }
 
-static inline u64 extent_map_block_end(struct extent_map *em)
-{
-	if (em->block_start + em->block_len < em->block_start)
-		return (u64)-1;
-	return em->block_start + em->block_len;
-}
-
 void extent_map_tree_init(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em, int modified);
 void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
 int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 		     u64 new_logical);
@@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..81ac1d474bf1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
 		goto out_unlock;
 	}
 
-	ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+	ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
 					 &end, EXTENT_DIRTY);
 	if (!ret && start == 0)
 		i_size = min(i_size, end + 1);
@@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
 
 	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
 		return 0;
-	return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+	return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
 			      EXTENT_DIRTY, NULL);
 }
 
@@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
 
 	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
 		return 0;
-	return clear_extent_bit(&inode->file_extent_tree, start,
+	return clear_extent_bit(inode->file_extent_tree, start,
 				start + len - 1, EXTENT_DIRTY, NULL);
 }
 
@@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 			return;
 		}
 		if (compress_type != BTRFS_COMPRESS_NONE) {
-			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-			em->compress_type = compress_type;
+			extent_map_set_compression(em, compress_type);
 			em->block_start = bytenr;
 			em->block_len = em->orig_block_len;
 		} else {
@@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 			em->block_start = bytenr;
 			em->block_len = em->len;
 			if (type == BTRFS_FILE_EXTENT_PREALLOC)
-				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+				em->flags |= EXTENT_FLAG_PREALLOC;
 		}
 	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 		em->block_start = EXTENT_MAP_INLINE;
@@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 		 */
 		em->orig_start = EXTENT_MAP_HOLE;
 		em->block_len = (u64)-1;
-		em->compress_type = compress_type;
-		if (compress_type != BTRFS_COMPRESS_NONE)
-			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		extent_map_set_compression(em, compress_type);
 	} else {
 		btrfs_err(fs_info,
 			  "unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32611a4edd6b..38dfcac47609 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
 		 * accessed as prepare_pages should have marked them accessed
 		 * in prepare_pages via find_or_create_page()
 		 */
-		btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
-					       block_len);
+		btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+						block_start, block_len);
 		unlock_page(pages[i]);
 		put_page(pages[i]);
 	}
@@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
 
-		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
-		btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
-		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+		btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+					       start_pos, num_bytes);
+		btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+						start_pos, num_bytes);
+		btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+					    start_pos, num_bytes);
 	}
 
 	/*
@@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode,
 		 * released.
 		 *
 		 * The private flag check is essential for subpage as we need
-		 * to store extra bitmap using page->private.
+		 * to store extra bitmap using folio private.
 		 */
-		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+		if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
 			unlock_page(page);
 			return -EAGAIN;
 		}
@@ -2150,7 +2153,6 @@ out:
 		hole_em->block_start = EXTENT_MAP_HOLE;
 		hole_em->block_len = 0;
 		hole_em->orig_block_len = 0;
-		hole_em->compress_type = BTRFS_COMPRESS_NONE;
 		hole_em->generation = trans->transid;
 
 		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
 
 	if (em->block_start == EXTENT_MAP_HOLE)
 		ret = RANGE_BOUNDARY_HOLE;
-	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+	else if (em->flags & EXTENT_FLAG_PREALLOC)
 		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
 	else
 		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode,
 	 * extents and holes, we drop all the existing extents and allocate a
 	 * new prealloc extent, so that we get a larger contiguous disk extent.
 	 */
-	if (em->start <= alloc_start &&
-	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
 		const u64 em_end = em->start + em->len;
 
 		if (em_end >= offset + len) {
@@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode,
 			goto out;
 		}
 
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		if (em->flags & EXTENT_FLAG_PREALLOC) {
 			free_extent_map(em);
 			ret = btrfs_fallocate_update_isize(inode, offset + len,
 							   mode);
@@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		last_byte = ALIGN(last_byte, blocksize);
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
-		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
 			const u64 range_len = last_byte - cur_offset;
 
 			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6f93c9a2c3e3..d372c7ce0e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 
 	for (i = 0; i < io_ctl->num_pages; i++) {
 		if (io_ctl->pages[i]) {
-			btrfs_page_clear_checked(io_ctl->fs_info,
-					io_ctl->pages[i],
+			btrfs_folio_clear_checked(io_ctl->fs_info,
+					page_folio(io_ctl->pages[i]),
 					page_offset(io_ctl->pages[i]),
 					PAGE_SIZE);
 			unlock_page(io_ctl->pages[i]);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 318df6f9d9cb..f8bb73d6ab68 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -188,6 +188,7 @@ enum {
 	BTRFS_MOUNT_IGNOREBADROOTS		= (1UL << 27),
 	BTRFS_MOUNT_IGNOREDATACSUMS		= (1UL << 28),
 	BTRFS_MOUNT_NODISCARD			= (1UL << 29),
+	BTRFS_MOUNT_NOSPACECACHE		= (1UL << 30),
 };
 
 /*
@@ -398,7 +399,8 @@ struct btrfs_fs_info {
 	struct extent_io_tree excluded_extents;
 
 	/* logical->physical extent mapping */
-	struct extent_map_tree mapping_tree;
+	struct rb_root_cached mapping_tree;
+	rwlock_t mapping_tree_lock;
 
 	/*
 	 * Block reservation for extent, checksum, root tree and delayed dir
@@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
 #define btrfs_test_opt(fs_info, opt)	((fs_info)->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 
-#define btrfs_set_and_info(fs_info, opt, fmt, args...)			\
-do {									\
-	if (!btrfs_test_opt(fs_info, opt))				\
-		btrfs_info(fs_info, fmt, ##args);			\
-	btrfs_set_opt(fs_info->mount_opt, opt);				\
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...)		\
-do {									\
-	if (btrfs_test_opt(fs_info, opt))				\
-		btrfs_info(fs_info, fmt, ##args);			\
-	btrfs_clear_opt(fs_info->mount_opt, opt);			\
-} while (0)
-
 static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 {
 	/* Do it this way so we only ever do one test_bit in the normal case. */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fea464b2a54e..809b11472a80 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
 	int mirror_num;
 };
 
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_special_inode_operations;
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 		 * range, then btrfs_mark_ordered_io_finished() will handle
 		 * the ordered extent accounting for the range.
 		 */
-		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
-					       offset, bytes);
+		btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+						page_folio(page), offset, bytes);
 		put_page(page);
 	}
 
@@ -1037,7 +1046,7 @@ free_pages:
 	if (pages) {
 		for (i = 0; i < nr_pages; i++) {
 			WARN_ON(pages[i]->mapping);
-			put_page(pages[i]);
+			btrfs_free_compr_page(pages[i]);
 		}
 		kfree(pages);
 	}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
 
 	for (i = 0; i < async_extent->nr_pages; i++) {
 		WARN_ON(async_extent->pages[i]->mapping);
-		put_page(async_extent->pages[i]);
+		btrfs_free_compr_page(async_extent->pages[i]);
 	}
 	kfree(async_extent->pages);
 	async_extent->nr_pages = 0;
@@ -2793,7 +2802,7 @@ out_page:
 					       PAGE_SIZE, !ret);
 		clear_page_dirty_for_io(page);
 	}
-	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+	btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
 	unlock_page(page);
 	put_page(page);
 	kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
 	 * page->mapping outside of the page lock.
 	 */
 	ihold(inode);
-	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+	btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
 	get_page(page);
 	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
 	fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->disk_num_bytes);
 		}
 	}
-	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+	unpin_extent_cache(inode, ordered_extent->file_offset,
 			   ordered_extent->num_bytes, trans->transid);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, ret);
@@ -3796,7 +3805,7 @@ cache_index:
 	 * cache.
 	 *
 	 * This is required for both inode re-read from disk and delayed inode
-	 * in delayed_nodes_tree.
+	 * in the delayed_nodes xarray.
 	 */
 	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4725,7 +4734,7 @@ again:
 	/*
 	 * We unlock the page after the io is completed and then re-lock it
 	 * above.  release_folio() could have come in between that and cleared
-	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
+	 * folio private, but left the page in the mapping.  Set the page mapped
 	 * here to make sure it's properly set for the subpage stuff.
 	 */
 	ret = set_page_extent_mapped(page);
@@ -4767,9 +4776,10 @@ again:
 			memzero_page(page, (block_start - page_offset(page)) + offset,
 				     len);
 	}
-	btrfs_page_clear_checked(fs_info, page, block_start,
-				 block_end + 1 - block_start);
-	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+	btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+				  block_end + 1 - block_start);
+	btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+			      block_end + 1 - block_start);
 	unlock_extent(io_tree, block_start, block_end, &cached_state);
 
 	if (only_release_metadata)
@@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
 		hole_size = last_byte - cur_offset;
 
-		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
 			struct extent_map *hole_em;
 
 			err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 			hole_em->block_len = 0;
 			hole_em->orig_block_len = 0;
 			hole_em->ram_bytes = hole_size;
-			hole_em->compress_type = BTRFS_COMPRESS_NONE;
 			hole_em->generation = btrfs_get_fs_generation(fs_info);
 
 			err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_generation = BTRFS_I(inode)->generation;
 
 	/*
+	 * We don't have any capability xattrs set here yet, shortcut any
+	 * queries for the xattrs here.  If we add them later via the inode
+	 * security init path or any other path this flag will be cleared.
+	 */
+	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+	/*
 	 * Subvolumes don't inherit flags from their parent directory.
 	 * Originally this was probably by accident, but we probably can't
 	 * change it now without compatibility issues.
@@ -7258,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
 	em->orig_block_len = orig_block_len;
 	em->ram_bytes = ram_bytes;
 	em->generation = -1;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
-	if (type == BTRFS_ORDERED_PREALLOC) {
-		set_bit(EXTENT_FLAG_FILLING, &em->flags);
-	} else if (type == BTRFS_ORDERED_COMPRESSED) {
-		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-		em->compress_type = compress_type;
-	}
+	em->flags |= EXTENT_FLAG_PINNED;
+	if (type == BTRFS_ORDERED_PREALLOC)
+		em->flags |= EXTENT_FLAG_FILLING;
+	else if (type == BTRFS_ORDERED_COMPRESSED)
+		extent_map_set_compression(em, compress_type);
 
 	ret = btrfs_replace_extent_map_range(inode, em, true);
 	if (ret) {
@@ -7304,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 	 * just use the extent.
 	 *
 	 */
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 	     em->block_start != EXTENT_MAP_HOLE)) {
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		if (em->flags & EXTENT_FLAG_PREALLOC)
 			type = BTRFS_ORDERED_PREALLOC;
 		else
 			type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	 * to buffered IO.  Don't blame me, this is the price we pay for using
 	 * the generic code.
 	 */
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	if (extent_map_is_compressed(em) ||
 	    em->block_start == EXTENT_MAP_INLINE) {
 		free_extent_map(em);
 		/*
@@ -7638,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	 * that, since we have locked only the parts we are performing I/O in.
 	 */
 	if ((em->block_start == EXTENT_MAP_HOLE) ||
-	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
 		iomap->addr = IOMAP_NULL_ADDR;
 		iomap->type = IOMAP_HOLE;
 	} else {
@@ -7851,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac)
 static void wait_subpage_spinlock(struct page *page)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct folio *folio = page_folio(page);
 	struct btrfs_subpage *subpage;
 
-	if (!btrfs_is_subpage(fs_info, page))
+	if (!btrfs_is_subpage(fs_info, page->mapping))
 		return;
 
-	ASSERT(PagePrivate(page) && page->private);
-	subpage = (struct btrfs_subpage *)page->private;
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
+	subpage = folio_get_private(folio);
 
 	/*
 	 * This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 				page_end);
 		ASSERT(range_end + 1 - cur < U32_MAX);
 		range_len = range_end + 1 - cur;
-		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
 			/*
 			 * If Ordered (Private2) is cleared, it means endio has
 			 * already been executed for the range.
@@ -8004,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 			 */
 			goto next;
 		}
-		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
 
 		/*
 		 * IO on this page will never be started, so we need to account
@@ -8074,7 +8089,7 @@ next:
 	 * did something wrong.
 	 */
 	ASSERT(!folio_test_ordered(folio));
-	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
 	if (!inode_evicting)
 		__btrfs_release_folio(folio, GFP_NOFS);
 	clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8113,7 @@ next:
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
+	struct folio *folio = page_folio(page);
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 	u64 page_end;
 	u64 end;
 
+	ASSERT(folio_order(folio) == 0);
+
 	reserved_space = PAGE_SIZE;
 
 	sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8235,9 @@ again:
 	if (zero_start != PAGE_SIZE)
 		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
 
-	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
-	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
-	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
 
 	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
 
@@ -8462,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_inode *ei;
 	struct inode *inode;
+	struct extent_io_tree *file_extent_tree = NULL;
+
+	/* Self tests may pass a NULL fs_info. */
+	if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+		file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+		if (!file_extent_tree)
+			return NULL;
+	}
 
 	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
+	if (!ei) {
+		kfree(file_extent_tree);
 		return NULL;
+	}
 
 	ei->root = NULL;
 	ei->generation = 0;
@@ -8501,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree);
+
+	/* This io tree sets the valid inode. */
 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 	ei->io_tree.inode = ei;
-	extent_io_tree_init(fs_info, &ei->file_extent_tree,
-			    IO_TREE_INODE_FILE_EXTENT);
+
+	ei->file_extent_tree = file_extent_tree;
+	if (file_extent_tree) {
+		extent_io_tree_init(fs_info, ei->file_extent_tree,
+				    IO_TREE_INODE_FILE_EXTENT);
+		/* Lockdep class is set only for the file extent tree. */
+		lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+	}
 	mutex_init(&ei->log_mutex);
 	spin_lock_init(&ei->ordered_tree_lock);
 	ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 void btrfs_test_destroy_inode(struct inode *inode)
 {
 	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+	kfree(BTRFS_I(inode)->file_extent_tree);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 #endif
 
 void btrfs_free_inode(struct inode *inode)
 {
+	kfree(BTRFS_I(inode)->file_extent_tree);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -9632,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		em->block_len = ins.offset;
 		em->orig_block_len = ins.offset;
 		em->ram_bytes = ins.offset;
-		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->flags |= EXTENT_FLAG_PREALLOC;
 		em->generation = trans->transid;
 
 		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
 		page = find_get_page(inode->vfs_inode.i_mapping, index);
 		ASSERT(page); /* Pages should be in the extent_io_tree */
 
-		btrfs_page_set_writeback(fs_info, page, start, len);
+		/* This is for data, which doesn't yet support larger folio. */
+		ASSERT(folio_order(page_folio(page)) == 0);
+		btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
 		put_page(page);
 		index++;
 	}
@@ -9994,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages);
+	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
 	if (ret) {
 		ret = -ENOMEM;
 		goto out;
@@ -10113,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 	encoded->len = min_t(u64, extent_map_end(em),
 			     inode->vfs_inode.i_size) - iocb->ki_pos;
 	if (em->block_start == EXTENT_MAP_HOLE ||
-	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+	    (em->flags & EXTENT_FLAG_PREALLOC)) {
 		disk_bytenr = EXTENT_MAP_HOLE;
 		count = min_t(u64, count, encoded->len);
 		encoded->len = count;
 		encoded->unencoded_len = count;
-	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+	} else if (extent_map_is_compressed(em)) {
 		disk_bytenr = em->block_start;
 		/*
 		 * Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 		encoded->unencoded_len = em->ram_bytes;
 		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
 		ret = btrfs_encoded_io_compression_from_extent(fs_info,
-							     em->compress_type);
+							       extent_map_compression(em));
 		if (ret < 0)
 			goto out_em;
 		encoded->compression = ret;
@@ -10564,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
 	struct extent_map *em = NULL;
+	struct btrfs_chunk_map *map = NULL;
 	struct btrfs_device *device = NULL;
 	struct btrfs_swap_info bsi = {
 		.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			ret = -EINVAL;
 			goto out;
 		}
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		if (extent_map_is_compressed(em)) {
 			btrfs_warn(fs_info, "swapfile must not be compressed");
 			ret = -EINVAL;
 			goto out;
@@ -10703,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			goto out;
 		}
 
-		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR(em);
+		map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+		if (IS_ERR(map)) {
+			ret = PTR_ERR(map);
 			goto out;
 		}
 
-		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
 			btrfs_warn(fs_info,
 				   "swapfile must have single data profile");
 			ret = -EINVAL;
@@ -10717,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 		}
 
 		if (device == NULL) {
-			device = em->map_lookup->stripes[0].dev;
+			device = map->stripes[0].dev;
 			ret = btrfs_add_swapfile_pin(inode, device, false);
 			if (ret == 1)
 				ret = 0;
 			else if (ret)
 				goto out;
-		} else if (device != em->map_lookup->stripes[0].dev) {
+		} else if (device != map->stripes[0].dev) {
 			btrfs_warn(fs_info, "swapfile must be on one device");
 			ret = -EINVAL;
 			goto out;
 		}
 
-		physical_block_start = (em->map_lookup->stripes[0].physical +
-					(logical_block_start - em->start));
-		len = min(len, em->len - (logical_block_start - em->start));
-		free_extent_map(em);
-		em = NULL;
+		physical_block_start = (map->stripes[0].physical +
+					(logical_block_start - map->start));
+		len = min(len, map->chunk_len - (logical_block_start - map->start));
+		btrfs_free_chunk_map(map);
+		map = NULL;
 
 		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
 		if (!bg) {
@@ -10786,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 out:
 	if (!IS_ERR_OR_NULL(em))
 		free_extent_map(em);
+	if (!IS_ERR_OR_NULL(map))
+		btrfs_free_chunk_map(map);
 
 	unlock_extent(io_tree, 0, isize - 1, &cached_state);
 
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
  *
  * @cache:      The cache.
  * @max_size:   Maximum size (number of entries) for the cache.
- *              Use 0 for unlimited size, it's the user's responsability to
+ *              Use 0 for unlimited size, it's the user's responsibility to
  *              trim the cache in that case.
  */
 void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..1131d5a29d61 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
 	cur_page = out_pages[*cur_out / PAGE_SIZE];
 	/* Allocate a new page */
 	if (!cur_page) {
-		cur_page = alloc_page(GFP_NOFS);
+		cur_page = btrfs_alloc_compr_page();
 		if (!cur_page)
 			return -ENOMEM;
 		out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
 		cur_page = out_pages[*cur_out / PAGE_SIZE];
 		/* Allocate a new page */
 		if (!cur_page) {
-			cur_page = alloc_page(GFP_NOFS);
+			cur_page = btrfs_alloc_compr_page();
 			if (!cur_page)
 				return -ENOMEM;
 			out_pages[*cur_out / PAGE_SIZE] = cur_page;
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index b8f9c9e56c8c..cdada4865837 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
  * panic or BUGs, depending on mount options.
  */
 __cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int error, const char *fmt, ...)
 {
 	char *s_id = "<unknown>";
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 4d04c1fa5899..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error);
 
 __printf(5, 6)
 __cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int error, const char *fmt, ...);
 /*
  * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a82e1417c4d2..59850dc17b22 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 		 *
 		 * If there's no such bit, we need to skip to next range.
 		 */
-		if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+		if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
+					      file_offset, len))
 			return false;
-		btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+		btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
 	}
 
 	/* Now we're fine to update the accounting. */
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 567a6d3d4712..127ef8bf0ffd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -97,13 +97,6 @@ struct btrfs_ordered_extent {
 	u64 bytes_left;
 
 	/*
-	 * the end of the ordered extent which is behind it but
-	 * didn't update disk_i_size. Please see the comment of
-	 * btrfs_ordered_update_i_size();
-	 */
-	u64 outstanding_isize;
-
-	/*
 	 * If we get truncated we need to adjust the file extent we enter for
 	 * this ordered extent so that we do not expose stale data.
 	 */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e46774e8f49f..63b426cc7798 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
  *
  * Must be called with qgroup_lock held and @prealloc preallocated.
  *
- * The control on the lifespan of @prealloc would be transfered to this
+ * The control on the lifespan of @prealloc would be transferred to this
  * function, thus caller should no longer touch @prealloc.
  */
 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..792c8e17c31d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 {
 	int ret;
 
-	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
 	if (ret < 0)
 		return ret;
 	/* Mapping all sectors */
@@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 	int ret;
 
 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
-				     rbio->stripe_pages + data_pages);
+				     rbio->stripe_pages + data_pages, 0);
 	if (ret < 0)
 		return ret;
 
@@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
 	int ret;
 
-	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
 	if (ret < 0)
 		return ret;
 
@@ -1549,7 +1549,6 @@ struct btrfs_plug_cb {
 	struct blk_plug_cb cb;
 	struct btrfs_fs_info *info;
 	struct list_head rbio_list;
-	struct work_struct work;
 };
 
 /*
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..470213688872 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -164,7 +164,7 @@ struct raid56_bio_trace_info {
 	u8 stripe_nr;
 };
 
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
 {
 	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
 }
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f88b0c2ac3fe..ae90894dc7dc 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 	if (datal < block_size)
 		memzero_page(page, datal, block_size - datal);
 
-	btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
-	btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
-	btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+	btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
+	btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
+	btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
 out_unlock:
 	if (page) {
 		unlock_page(page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5d9e5f74a52..abe594f77f99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
 		 * will re-read the whole page anyway.
 		 */
 		if (page) {
-			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+			btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
 					round_up(i_size, PAGE_SIZE) - i_size);
 			unlock_page(page);
 			put_page(page);
@@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
 	em->len = end + 1 - start;
 	em->block_len = em->len;
 	em->block_start = block_start;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->flags |= EXTENT_FLAG_PINNED;
 
 	lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
 	ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
@@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
 						       clamped_len);
 			goto release_page;
 		}
-		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+		btrfs_folio_set_dirty(fs_info, page_folio(page),
+				      clamped_start, clamped_len);
 
 		/*
 		 * Set the boundary if it's inside the page.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f62a408671cb..a01807cbd4d4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,7 +43,7 @@ struct scrub_ctx;
 /*
  * The following value only influences the performance.
  *
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
  * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
  */
 #define SCRUB_STRIPES_PER_GROUP		8
@@ -192,7 +192,6 @@ struct scrub_ctx {
 	int			cur_stripe;
 	atomic_t		cancel_req;
 	int			readonly;
-	int			sectors_per_bio;
 
 	/* State of IO submission throttling affecting the associated device */
 	ktime_t			throttle_deadline;
@@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
 	atomic_set(&stripe->pending_io, 0);
 	spin_lock_init(&stripe->write_error_lock);
 
-	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
 	if (ret < 0)
 		goto error;
 
@@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
 	/* Metadata, verify the full tree block. */
 	if (sector->is_metadata) {
 		/*
-		 * Check if the tree block crosses the stripe boudary.  If
+		 * Check if the tree block crosses the stripe boundary.  If
 		 * crossed the boundary, we cannot verify it but only give a
 		 * warning.
 		 *
@@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
 	/*
 	 * Init needed infos for error reporting.
 	 *
-	 * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
 	 * thus no need for dev/physical, error reporting still needs dev and physical.
 	 */
 	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -1280,7 +1279,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
  * return 0 if it is a data stripe, 1 means parity stripe.
  */
 static int get_raid56_logic_offset(u64 physical, int num,
-				   struct map_lookup *map, u64 *offset,
+				   struct btrfs_chunk_map *map, u64 *offset,
 				   u64 *stripe_start)
 {
 	int i;
@@ -1409,14 +1408,11 @@ search_forward:
 		if (ret > 0)
 			break;
 next:
-		path->slots[0]++;
-		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret) {
-				/* Either no more item or fatal error */
-				btrfs_release_path(path);
-				return ret;
-			}
+		ret = btrfs_next_item(extent_root, path);
+		if (ret) {
+			/* Either no more items or a fatal error. */
+			btrfs_release_path(path);
+			return ret;
 		}
 	}
 	btrfs_release_path(path);
@@ -1816,7 +1812,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
 	if (sctx->is_dev_replace) {
 		/*
 		 * For dev-replace, if we know there is something wrong with
-		 * metadata, we should immedately abort.
+		 * metadata, we should immediately abort.
 		 */
 		for (int i = 0; i < nr_stripes; i++) {
 			if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1898,7 +1894,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 				      struct btrfs_device *scrub_dev,
 				      struct btrfs_block_group *bg,
-				      struct map_lookup *map,
+				      struct btrfs_chunk_map *map,
 				      u64 full_stripe_start)
 {
 	DECLARE_COMPLETION_ONSTACK(io_done);
@@ -2067,7 +2063,7 @@ out:
  */
 static int scrub_simple_mirror(struct scrub_ctx *sctx,
 			       struct btrfs_block_group *bg,
-			       struct map_lookup *map,
+			       struct btrfs_chunk_map *map,
 			       u64 logical_start, u64 logical_length,
 			       struct btrfs_device *device,
 			       u64 physical, int mirror_num)
@@ -2128,7 +2124,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 }
 
 /* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
 {
 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 			    BTRFS_BLOCK_GROUP_RAID10));
@@ -2137,7 +2133,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
 }
 
 /* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
 				     struct btrfs_block_group *bg,
 				     int stripe_index)
 {
@@ -2154,7 +2150,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
 }
 
 /* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
 {
 	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 			    BTRFS_BLOCK_GROUP_RAID10));
@@ -2166,7 +2162,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
 
 static int scrub_simple_stripe(struct scrub_ctx *sctx,
 			       struct btrfs_block_group *bg,
-			       struct map_lookup *map,
+			       struct btrfs_chunk_map *map,
 			       struct btrfs_device *device,
 			       int stripe_index)
 {
@@ -2199,18 +2195,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
 
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct btrfs_block_group *bg,
-					   struct extent_map *em,
+					   struct btrfs_chunk_map *map,
 					   struct btrfs_device *scrub_dev,
 					   int stripe_index)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct map_lookup *map = em->map_lookup;
 	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
 	const u64 chunk_logical = bg->start;
 	int ret;
 	int ret2;
 	u64 physical = map->stripes[stripe_index].physical;
-	const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
 	const u64 physical_end = physical + dev_stripe_len;
 	u64 logical;
 	u64 logic_end;
@@ -2373,17 +2368,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  u64 dev_extent_len)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
-	struct map_lookup *map;
-	struct extent_map *em;
+	struct btrfs_chunk_map *map;
 	int i;
 	int ret = 0;
 
-	read_lock(&map_tree->lock);
-	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
-	read_unlock(&map_tree->lock);
-
-	if (!em) {
+	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+	if (!map) {
 		/*
 		 * Might have been an unused block group deleted by the cleaner
 		 * kthread or relocation.
@@ -2395,22 +2385,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 
 		return ret;
 	}
-	if (em->start != bg->start)
+	if (map->start != bg->start)
 		goto out;
-	if (em->len < dev_extent_len)
+	if (map->chunk_len < dev_extent_len)
 		goto out;
 
-	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; ++i) {
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
-			ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+			ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
 			if (ret)
 				goto out;
 		}
 	}
 out:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 
 	return ret;
 }
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 2347cf15278b..93511d54abf8 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,7 @@
  *   This means a slightly higher tree locking latency.
  */
 
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
 {
 	if (fs_info->sectorsize >= PAGE_SIZE)
 		return false;
@@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
 	 * mapping. And if page->mapping->host is data inode, it's subpage.
 	 * As we have ruled our sectorsize >= PAGE_SIZE case already.
 	 */
-	if (!page->mapping || !page->mapping->host ||
-	    is_data_inode(page->mapping->host))
+	if (!mapping || !mapping->host || is_data_inode(mapping->host))
 		return true;
 
 	/*
@@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
 }
 
 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
-			 struct page *page, enum btrfs_subpage_type type)
+			 struct folio *folio, enum btrfs_subpage_type type)
 {
 	struct btrfs_subpage *subpage;
 
@@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
 	 * We have cases like a dummy extent buffer page, which is not mapped
 	 * and doesn't need to be locked.
 	 */
-	if (page->mapping)
-		ASSERT(PageLocked(page));
+	if (folio->mapping)
+		ASSERT(folio_test_locked(folio));
 
-	/* Either not subpage, or the page already has private attached */
-	if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+	/* Either not subpage, or the folio already has private attached. */
+	if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
 		return 0;
 
 	subpage = btrfs_alloc_subpage(fs_info, type);
 	if (IS_ERR(subpage))
 		return  PTR_ERR(subpage);
 
-	attach_page_private(page, subpage);
+	folio_attach_private(folio, subpage);
 	return 0;
 }
 
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
-			  struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
 {
 	struct btrfs_subpage *subpage;
 
-	/* Either not subpage, or already detached */
-	if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+	/* Either not subpage, or the folio already has private attached. */
+	if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
 		return;
 
-	subpage = detach_page_private(page);
+	subpage = folio_detach_private(folio);
 	ASSERT(subpage);
 	btrfs_free_subpage(subpage);
 }
@@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
  * This is important for eb allocation, to prevent race with last eb freeing
  * of the same page.
  * With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
  * allocating the extent buffer.
  */
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
 {
 	struct btrfs_subpage *subpage;
 
-	if (!btrfs_is_subpage(fs_info, page))
+	if (!btrfs_is_subpage(fs_info, folio->mapping))
 		return;
 
-	ASSERT(PagePrivate(page) && page->mapping);
-	lockdep_assert_held(&page->mapping->i_private_lock);
+	ASSERT(folio_test_private(folio) && folio->mapping);
+	lockdep_assert_held(&folio->mapping->i_private_lock);
 
-	subpage = (struct btrfs_subpage *)page->private;
+	subpage = folio_get_private(folio);
 	atomic_inc(&subpage->eb_refs);
 }
 
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
 {
 	struct btrfs_subpage *subpage;
 
-	if (!btrfs_is_subpage(fs_info, page))
+	if (!btrfs_is_subpage(fs_info, folio->mapping))
 		return;
 
-	ASSERT(PagePrivate(page) && page->mapping);
-	lockdep_assert_held(&page->mapping->i_private_lock);
+	ASSERT(folio_test_private(folio) && folio->mapping);
+	lockdep_assert_held(&folio->mapping->i_private_lock);
 
-	subpage = (struct btrfs_subpage *)page->private;
+	subpage = folio_get_private(folio);
 	ASSERT(atomic_read(&subpage->eb_refs));
 	atomic_dec(&subpage->eb_refs);
 }
 
 static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				 struct folio *folio, u64 start, u32 len)
 {
+	/* For subpage support, the folio must be single page. */
+	ASSERT(folio_order(folio) == 0);
+
 	/* Basic checks */
-	ASSERT(PagePrivate(page) && page->private);
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
 	/*
 	 * The range check only works for mapped page, we can still have
 	 * unmapped page like dummy extent buffer pages.
 	 */
-	if (page->mapping)
-		ASSERT(page_offset(page) <= start &&
-		       start + len <= page_offset(page) + PAGE_SIZE);
+	if (folio->mapping)
+		ASSERT(folio_pos(folio) <= start &&
+		       start + len <= folio_pos(folio) + PAGE_SIZE);
 }
 
 void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 	const int nbits = len >> fs_info->sectorsize_bits;
 
-	btrfs_subpage_assert(fs_info, page, start, len);
+	btrfs_subpage_assert(fs_info, folio, start, len);
 
 	atomic_add(nbits, &subpage->readers);
 }
 
 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+			      struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 	const int nbits = len >> fs_info->sectorsize_bits;
 	bool is_data;
 	bool last;
 
-	btrfs_subpage_assert(fs_info, page, start, len);
-	is_data = is_data_inode(page->mapping->host);
+	btrfs_subpage_assert(fs_info, folio, start, len);
+	is_data = is_data_inode(folio->mapping->host);
 	ASSERT(atomic_read(&subpage->readers) >= nbits);
 	last = atomic_sub_and_test(nbits, &subpage->readers);
 
@@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
 	 * As we want the atomic_sub_and_test() to be always executed.
 	 */
 	if (is_data && last)
-		unlock_page(page);
+		folio_unlock(folio);
 }
 
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
 {
 	u64 orig_start = *start;
 	u32 orig_len = *len;
 
-	*start = max_t(u64, page_offset(page), orig_start);
+	*start = max_t(u64, folio_pos(folio), orig_start);
 	/*
 	 * For certain call sites like btrfs_drop_pages(), we may have pages
 	 * beyond the target range. In that case, just set @len to 0, subpage
 	 * helpers can handle @len == 0 without any problem.
 	 */
-	if (page_offset(page) >= orig_start + orig_len)
+	if (folio_pos(folio) >= orig_start + orig_len)
 		*len = 0;
 	else
-		*len = min_t(u64, page_offset(page) + PAGE_SIZE,
+		*len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
 			     orig_start + orig_len) - *start;
 }
 
 void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 	const int nbits = (len >> fs_info->sectorsize_bits);
 	int ret;
 
-	btrfs_subpage_assert(fs_info, page, start, len);
+	btrfs_subpage_assert(fs_info, folio, start, len);
 
 	ASSERT(atomic_read(&subpage->readers) == 0);
 	ret = atomic_add_return(nbits, &subpage->writers);
@@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
 }
 
 bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				       struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 	const int nbits = (len >> fs_info->sectorsize_bits);
 
-	btrfs_subpage_assert(fs_info, page, start, len);
+	btrfs_subpage_assert(fs_info, folio, start, len);
 
 	/*
 	 * We have call sites passing @lock_page into
@@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
 }
 
 /*
- * Lock a page for delalloc page writeback.
+ * Lock a folio for delalloc page writeback.
  *
  * Return -EAGAIN if the page is not properly initialized.
  * Return 0 with the page locked, and writer counter updated.
@@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
  * it's really the correct page, as the caller is using
  * filemap_get_folios_contig(), which can race with page invalidating.
  */
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+				  struct folio *folio, u64 start, u32 len)
 {
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
-		lock_page(page);
+	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+		folio_lock(folio);
 		return 0;
 	}
-	lock_page(page);
-	if (!PagePrivate(page) || !page->private) {
-		unlock_page(page);
+	folio_lock(folio);
+	if (!folio_test_private(folio) || !folio_get_private(folio)) {
+		folio_unlock(folio);
 		return -EAGAIN;
 	}
-	btrfs_subpage_clamp_range(page, &start, &len);
-	btrfs_subpage_start_writer(fs_info, page, start, len);
+	btrfs_subpage_clamp_range(folio, &start, &len);
+	btrfs_subpage_start_writer(fs_info, folio, start, len);
 	return 0;
 }
 
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, u64 start, u32 len)
 {
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
-		return unlock_page(page);
-	btrfs_subpage_clamp_range(page, &start, &len);
-	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
-		unlock_page(page);
+	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+		folio_unlock(folio);
+		return;
+	}
+	btrfs_subpage_clamp_range(folio, &start, &len);
+	if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+		folio_unlock(folio);
 }
 
-#define subpage_calc_start_bit(fs_info, page, name, start, len)		\
+#define subpage_calc_start_bit(fs_info, folio, name, start, len)	\
 ({									\
 	unsigned int start_bit;						\
 									\
-	btrfs_subpage_assert(fs_info, page, start, len);		\
+	btrfs_subpage_assert(fs_info, folio, start, len);		\
 	start_bit = offset_in_page(start) >> fs_info->sectorsize_bits;	\
 	start_bit += fs_info->subpage_info->name##_offset;		\
 	start_bit;							\
@@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
 			fs_info->subpage_info->bitmap_nr_bits)
 
 void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							uptodate, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
 	if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
-		SetPageUptodate(page);
+		folio_mark_uptodate(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				  struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							uptodate, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	ClearPageUptodate(page);
+	folio_clear_uptodate(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+			     struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							dirty, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
 	spin_unlock_irqrestore(&subpage->lock, flags);
-	set_page_dirty(page);
+	folio_mark_dirty(folio);
 }
 
 /*
@@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
  * extra handling for tree blocks.
  */
 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+					struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							dirty, start, len);
 	unsigned long flags;
 	bool last = false;
@@ -455,101 +456,101 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
 }
 
 void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+			       struct folio *folio, u64 start, u32 len)
 {
 	bool last;
 
-	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+	last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
 	if (last)
-		clear_page_dirty_for_io(page);
+		folio_clear_dirty_for_io(folio);
 }
 
 void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				 struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							writeback, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	set_page_writeback(page);
+	folio_start_writeback(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				   struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							writeback, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
 	if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
-		ASSERT(PageWriteback(page));
-		end_page_writeback(page);
+		ASSERT(folio_test_writeback(folio));
+		folio_end_writeback(folio);
 	}
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+			       struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							ordered, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	SetPageOrdered(page);
+	folio_set_ordered(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len)
+				 struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							ordered, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
 	if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
-		ClearPageOrdered(page);
+		folio_clear_ordered(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
-			       struct page *page, u64 start, u32 len)
+			       struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							checked, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
 	if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
-		SetPageChecked(page);
+		folio_set_checked(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
 void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
-				 struct page *page, u64 start, u32 len)
+				 struct folio *folio, u64 start, u32 len)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+	struct btrfs_subpage *subpage = folio_get_private(folio);
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
 							checked, start, len);
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
-	ClearPageChecked(page);
+	folio_clear_checked(folio);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 }
 
@@ -559,10 +560,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
  */
 #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
 bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len)			\
+			       struct folio *folio, u64 start, u32 len)	\
 {									\
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
-	unsigned int start_bit = subpage_calc_start_bit(fs_info, page,	\
+	struct btrfs_subpage *subpage = folio_get_private(folio);	\
+	unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,	\
 						name, start, len);	\
 	unsigned long flags;						\
 	bool ret;							\
@@ -584,88 +585,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
  * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
  * back to regular sectorsize branch.
  */
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
-			       test_page_func)				\
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
-		struct page *page, u64 start, u32 len)			\
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func,			\
+				 folio_clear_func, folio_test_func)	\
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info,	\
+			    struct folio *folio, u64 start, u32 len)	\
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {	\
-		set_page_func(page);					\
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping)) {		\
+		folio_set_func(folio);					\
 		return;							\
 	}								\
-	btrfs_subpage_set_##name(fs_info, page, start, len);		\
+	btrfs_subpage_set_##name(fs_info, folio, start, len);		\
 }									\
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len)			\
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info,	\
+			      struct folio *folio, u64 start, u32 len)	\
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {	\
-		clear_page_func(page);					\
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping)) {		\
+		folio_clear_func(folio);				\
 		return;							\
 	}								\
-	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
+	btrfs_subpage_clear_##name(fs_info, folio, start, len);		\
 }									\
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len)			\
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info,	\
+			     struct folio *folio, u64 start, u32 len)	\
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))	\
-		return test_page_func(page);				\
-	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping))			\
+		return folio_test_func(folio);				\
+	return btrfs_subpage_test_##name(fs_info, folio, start, len);	\
 }									\
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len)			\
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
+				  struct folio *folio, u64 start, u32 len) \
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {	\
-		set_page_func(page);					\
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping)) {		\
+		folio_set_func(folio);					\
 		return;							\
 	}								\
-	btrfs_subpage_clamp_range(page, &start, &len);			\
-	btrfs_subpage_set_##name(fs_info, page, start, len);		\
+	btrfs_subpage_clamp_range(folio, &start, &len);			\
+	btrfs_subpage_set_##name(fs_info, folio, start, len);		\
 }									\
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
-		struct page *page, u64 start, u32 len)			\
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+				    struct folio *folio, u64 start, u32 len) \
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {	\
-		clear_page_func(page);					\
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping)) {		\
+		folio_clear_func(folio);				\
 		return;							\
 	}								\
-	btrfs_subpage_clamp_range(page, &start, &len);			\
-	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
+	btrfs_subpage_clamp_range(folio, &start, &len);			\
+	btrfs_subpage_clear_##name(fs_info, folio, start, len);		\
 }									\
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len)			\
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
+				   struct folio *folio, u64 start, u32 len) \
 {									\
-	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))	\
-		return test_page_func(page);				\
-	btrfs_subpage_clamp_range(page, &start, &len);			\
-	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
-			 PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
-			 PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
-			 PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
-			 PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+	if (unlikely(!fs_info) ||					\
+	    !btrfs_is_subpage(fs_info, folio->mapping))			\
+		return folio_test_func(folio);				\
+	btrfs_subpage_clamp_range(folio, &start, &len);			\
+	return btrfs_subpage_test_##name(fs_info, folio, start, len);	\
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+			 folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+			 folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+			 folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+			 folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+			 folio_test_checked);
 
 /*
  * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
  * is cleared.
  */
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
-				 struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
 {
-	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	struct btrfs_subpage *subpage = folio_get_private(folio);
 
 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
 		return;
 
-	ASSERT(!PageDirty(page));
-	if (!btrfs_is_subpage(fs_info, page))
+	ASSERT(!folio_test_dirty(folio));
+	if (!btrfs_is_subpage(fs_info, folio->mapping))
 		return;
 
-	ASSERT(PagePrivate(page) && page->private);
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 	ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
 }
 
@@ -684,18 +691,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
  *   extent_write_locked_range().
  *   In this case, we have to call subpage helper to handle the case.
  */
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
-			      u64 start, u32 len)
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+			       struct folio *folio, u64 start, u32 len)
 {
 	struct btrfs_subpage *subpage;
 
-	ASSERT(PageLocked(page));
+	ASSERT(folio_test_locked(folio));
 	/* For non-subpage case, we just unlock the page */
-	if (!btrfs_is_subpage(fs_info, page))
-		return unlock_page(page);
+	if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+		folio_unlock(folio);
+		return;
+	}
 
-	ASSERT(PagePrivate(page) && page->private);
-	subpage = (struct btrfs_subpage *)page->private;
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
+	subpage = folio_get_private(folio);
 
 	/*
 	 * For subpage case, there are two types of locked page.  With or
@@ -704,12 +713,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
 	 * Since we own the page lock, no one else could touch subpage::writers
 	 * and we are safe to do several atomic operations without spinlock.
 	 */
-	if (atomic_read(&subpage->writers) == 0)
+	if (atomic_read(&subpage->writers) == 0) {
 		/* No writers, locked by plain lock_page() */
-		return unlock_page(page);
+		folio_unlock(folio);
+		return;
+	}
 
 	/* Have writers, use proper subpage helper to end it */
-	btrfs_page_end_writer_lock(fs_info, page, start, len);
+	btrfs_folio_end_writer_lock(fs_info, folio, start, len);
 }
 
 #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst)		\
@@ -717,7 +728,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
 		   subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
 
 void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
-				      struct page *page, u64 start, u32 len)
+				      struct folio *folio, u64 start, u32 len)
 {
 	struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
 	struct btrfs_subpage *subpage;
@@ -729,9 +740,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 	unsigned long checked_bitmap;
 	unsigned long flags;
 
-	ASSERT(PagePrivate(page) && page->private);
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 	ASSERT(subpage_info);
-	subpage = (struct btrfs_subpage *)page->private;
+	subpage = folio_get_private(folio);
 
 	spin_lock_irqsave(&subpage->lock, flags);
 	GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
@@ -741,10 +752,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 	GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
 	spin_unlock_irqrestore(&subpage->lock, flags);
 
-	dump_page(page, "btrfs subpage dump");
+	dump_page(folio_page(folio, 0), "btrfs subpage dump");
 	btrfs_warn(fs_info,
 "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
-		    start, len, page_offset(page),
+		    start, len, folio_pos(folio),
 		    subpage_info->bitmap_nr_bits, &uptodate_bitmap,
 		    subpage_info->bitmap_nr_bits, &error_bitmap,
 		    subpage_info->bitmap_nr_bits, &dirty_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..793c2b314a58 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -73,71 +73,68 @@ enum btrfs_subpage_type {
 	BTRFS_SUBPAGE_DATA,
 };
 
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
 
 void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
-			 struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
-			  struct page *page);
+			 struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
 
 /* Allocate additional data where page represents more than one sector */
 struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
 					  enum btrfs_subpage_type type);
 void btrfs_free_subpage(struct btrfs_subpage *subpage);
 
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
-			    struct page *page);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
 
 void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
+				struct folio *folio, u64 start, u32 len);
 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
+			      struct folio *folio, u64 start, u32 len);
 
 void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
+				struct folio *folio, u64 start, u32 len);
 bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
+				       struct folio *folio, u64 start, u32 len);
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+				  struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+				 struct folio *folio, u64 start, u32 len);
 
 /*
  * Template for subpage related operations.
  *
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
  *
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the page can either be subpage
+ * specific or regular folios. The function will handle both cases.
+ * But the range still needs to be inside one single page.
  *
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
  * need to be inside the page. Those functions will truncate the range
  * automatically.
  */
 #define DECLARE_BTRFS_SUBPAGE_OPS(name)					\
 void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
+		struct folio *folio, u64 start, u32 len);			\
 void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
+		struct folio *folio, u64 start, u32 len);			\
 bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,		\
-		struct page *page, u64 start, u32 len);			\
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);			\
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
-		struct page *page, u64 start, u32 len);
+		struct folio *folio, u64 start, u32 len);			\
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);			\
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);			\
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);			\
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);			\
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);			\
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
+		struct folio *folio, u64 start, u32 len);
 
 DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
 DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
 DECLARE_BTRFS_SUBPAGE_OPS(checked);
 
 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
-		struct page *page, u64 start, u32 len);
+					struct folio *folio, u64 start, u32 len);
 
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
-				 struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
-			      u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+			       struct folio *folio, u64 start, u32 len);
 void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
-				      struct page *page, u64 start, u32 len);
+				      struct folio *folio, u64 start, u32 len);
 
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2a3a5bf102dc..896acfda1789 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/crc32c.h>
 #include <linux/btrfs.h>
 #include <linux/security.h>
+#include <linux/fs_parser.h>
 #include "messages.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -64,19 +65,7 @@
 #include <trace/events/btrfs.h>
 
 static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
 static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
 
 static void btrfs_put_super(struct super_block *sb)
 {
@@ -86,8 +75,22 @@ static void btrfs_put_super(struct super_block *sb)
 	close_ctree(fs_info);
 }
 
+/* Store the mount options related information. */
+struct btrfs_fs_context {
+	char *subvol_name;
+	u64 subvol_objectid;
+	u64 max_inline;
+	u32 commit_interval;
+	u32 metadata_ratio;
+	u32 thread_pool_size;
+	unsigned long mount_opt;
+	unsigned long compress_type:4;
+	unsigned int compress_level;
+	refcount_t refs;
+};
+
 enum {
-	Opt_acl, Opt_noacl,
+	Opt_acl,
 	Opt_clear_cache,
 	Opt_commit_interval,
 	Opt_compress,
@@ -97,27 +100,26 @@ enum {
 	Opt_degraded,
 	Opt_device,
 	Opt_fatal_errors,
-	Opt_flushoncommit, Opt_noflushoncommit,
+	Opt_flushoncommit,
 	Opt_max_inline,
-	Opt_barrier, Opt_nobarrier,
-	Opt_datacow, Opt_nodatacow,
-	Opt_datasum, Opt_nodatasum,
-	Opt_defrag, Opt_nodefrag,
-	Opt_discard, Opt_nodiscard,
+	Opt_barrier,
+	Opt_datacow,
+	Opt_datasum,
+	Opt_defrag,
+	Opt_discard,
 	Opt_discard_mode,
-	Opt_norecovery,
 	Opt_ratio,
 	Opt_rescan_uuid_tree,
 	Opt_skip_balance,
-	Opt_space_cache, Opt_no_space_cache,
+	Opt_space_cache,
 	Opt_space_cache_version,
-	Opt_ssd, Opt_nossd,
-	Opt_ssd_spread, Opt_nossd_spread,
+	Opt_ssd,
+	Opt_ssd_spread,
 	Opt_subvol,
 	Opt_subvol_empty,
 	Opt_subvolid,
 	Opt_thread_pool,
-	Opt_treelog, Opt_notreelog,
+	Opt_treelog,
 	Opt_user_subvol_rm_allowed,
 
 	/* Rescue options */
@@ -128,14 +130,10 @@ enum {
 	Opt_ignoredatacsums,
 	Opt_rescue_all,
 
-	/* Deprecated options */
-	Opt_recovery,
-	Opt_inode_cache, Opt_noinode_cache,
-
 	/* Debugging options */
-	Opt_enospc_debug, Opt_noenospc_debug,
+	Opt_enospc_debug,
 #ifdef CONFIG_BTRFS_DEBUG
-	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+	Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 #endif
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	Opt_ref_verify,
@@ -143,785 +141,611 @@ enum {
 	Opt_err,
 };
 
-static const match_table_t tokens = {
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_clear_cache, "clear_cache"},
-	{Opt_commit_interval, "commit=%u"},
-	{Opt_compress, "compress"},
-	{Opt_compress_type, "compress=%s"},
-	{Opt_compress_force, "compress-force"},
-	{Opt_compress_force_type, "compress-force=%s"},
-	{Opt_degraded, "degraded"},
-	{Opt_device, "device=%s"},
-	{Opt_fatal_errors, "fatal_errors=%s"},
-	{Opt_flushoncommit, "flushoncommit"},
-	{Opt_noflushoncommit, "noflushoncommit"},
-	{Opt_inode_cache, "inode_cache"},
-	{Opt_noinode_cache, "noinode_cache"},
-	{Opt_max_inline, "max_inline=%s"},
-	{Opt_barrier, "barrier"},
-	{Opt_nobarrier, "nobarrier"},
-	{Opt_datacow, "datacow"},
-	{Opt_nodatacow, "nodatacow"},
-	{Opt_datasum, "datasum"},
-	{Opt_nodatasum, "nodatasum"},
-	{Opt_defrag, "autodefrag"},
-	{Opt_nodefrag, "noautodefrag"},
-	{Opt_discard, "discard"},
-	{Opt_discard_mode, "discard=%s"},
-	{Opt_nodiscard, "nodiscard"},
-	{Opt_norecovery, "norecovery"},
-	{Opt_ratio, "metadata_ratio=%u"},
-	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
-	{Opt_skip_balance, "skip_balance"},
-	{Opt_space_cache, "space_cache"},
-	{Opt_no_space_cache, "nospace_cache"},
-	{Opt_space_cache_version, "space_cache=%s"},
-	{Opt_ssd, "ssd"},
-	{Opt_nossd, "nossd"},
-	{Opt_ssd_spread, "ssd_spread"},
-	{Opt_nossd_spread, "nossd_spread"},
-	{Opt_subvol, "subvol=%s"},
-	{Opt_subvol_empty, "subvol="},
-	{Opt_subvolid, "subvolid=%s"},
-	{Opt_thread_pool, "thread_pool=%u"},
-	{Opt_treelog, "treelog"},
-	{Opt_notreelog, "notreelog"},
-	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+	Opt_fatal_errors_panic,
+	Opt_fatal_errors_bug,
+};
 
-	/* Rescue options */
-	{Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+	{ "panic", Opt_fatal_errors_panic },
+	{ "bug", Opt_fatal_errors_bug },
+	{}
+};
+
+enum {
+	Opt_discard_sync,
+	Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+	{ "sync", Opt_discard_sync },
+	{ "async", Opt_discard_async },
+	{}
+};
+
+enum {
+	Opt_space_cache_v1,
+	Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+	{ "v1", Opt_space_cache_v1 },
+	{ "v2", Opt_space_cache_v2 },
+	{}
+};
+
+enum {
+	Opt_rescue_usebackuproot,
+	Opt_rescue_nologreplay,
+	Opt_rescue_ignorebadroots,
+	Opt_rescue_ignoredatacsums,
+	Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+	{ "usebackuproot", Opt_rescue_usebackuproot },
+	{ "nologreplay", Opt_rescue_nologreplay },
+	{ "ignorebadroots", Opt_rescue_ignorebadroots },
+	{ "ibadroots", Opt_rescue_ignorebadroots },
+	{ "ignoredatacsums", Opt_rescue_ignoredatacsums },
+	{ "idatacsums", Opt_rescue_ignoredatacsums },
+	{ "all", Opt_rescue_parameter_all },
+	{}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+	Opt_fragment_parameter_data,
+	Opt_fragment_parameter_metadata,
+	Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+	{ "data", Opt_fragment_parameter_data },
+	{ "metadata", Opt_fragment_parameter_metadata },
+	{ "all", Opt_fragment_parameter_all },
+	{}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+	fsparam_flag_no("acl", Opt_acl),
+	fsparam_flag_no("autodefrag", Opt_defrag),
+	fsparam_flag_no("barrier", Opt_barrier),
+	fsparam_flag("clear_cache", Opt_clear_cache),
+	fsparam_u32("commit", Opt_commit_interval),
+	fsparam_flag("compress", Opt_compress),
+	fsparam_string("compress", Opt_compress_type),
+	fsparam_flag("compress-force", Opt_compress_force),
+	fsparam_string("compress-force", Opt_compress_force_type),
+	fsparam_flag_no("datacow", Opt_datacow),
+	fsparam_flag_no("datasum", Opt_datasum),
+	fsparam_flag("degraded", Opt_degraded),
+	fsparam_string("device", Opt_device),
+	fsparam_flag_no("discard", Opt_discard),
+	fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+	fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+	fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+	fsparam_string("max_inline", Opt_max_inline),
+	fsparam_u32("metadata_ratio", Opt_ratio),
+	fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+	fsparam_flag("skip_balance", Opt_skip_balance),
+	fsparam_flag_no("space_cache", Opt_space_cache),
+	fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+	fsparam_flag_no("ssd", Opt_ssd),
+	fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+	fsparam_string("subvol", Opt_subvol),
+	fsparam_flag("subvol=", Opt_subvol_empty),
+	fsparam_u64("subvolid", Opt_subvolid),
+	fsparam_u32("thread_pool", Opt_thread_pool),
+	fsparam_flag_no("treelog", Opt_treelog),
+	fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+	/* Rescue options. */
+	fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
 	/* Deprecated, with alias rescue=nologreplay */
-	{Opt_nologreplay, "nologreplay"},
+	__fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
 	/* Deprecated, with alias rescue=usebackuproot */
-	{Opt_usebackuproot, "usebackuproot"},
+	__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
 
-	/* Deprecated options */
-	{Opt_recovery, "recovery"},
-
-	/* Debugging options */
-	{Opt_enospc_debug, "enospc_debug"},
-	{Opt_noenospc_debug, "noenospc_debug"},
+	/* Debugging options. */
+	fsparam_flag_no("enospc_debug", Opt_enospc_debug),
 #ifdef CONFIG_BTRFS_DEBUG
-	{Opt_fragment_data, "fragment=data"},
-	{Opt_fragment_metadata, "fragment=metadata"},
-	{Opt_fragment_all, "fragment=all"},
+	fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
 #endif
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
-	{Opt_ref_verify, "ref_verify"},
+	fsparam_flag("ref_verify", Opt_ref_verify),
 #endif
-	{Opt_err, NULL},
+	{}
 };
 
-static const match_table_t rescue_tokens = {
-	{Opt_usebackuproot, "usebackuproot"},
-	{Opt_nologreplay, "nologreplay"},
-	{Opt_ignorebadroots, "ignorebadroots"},
-	{Opt_ignorebadroots, "ibadroots"},
-	{Opt_ignoredatacsums, "ignoredatacsums"},
-	{Opt_ignoredatacsums, "idatacsums"},
-	{Opt_rescue_all, "all"},
-	{Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
-			    const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
 {
-	if (fs_info->mount_opt & opt) {
-		btrfs_err(fs_info, "%s must be used with ro mount option",
-			  opt_name);
-		return true;
-	}
-	return false;
+	return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
 }
 
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *opts;
-	char *orig;
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int ret = 0;
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
 
-	opts = kstrdup(options, GFP_KERNEL);
-	if (!opts)
-		return -ENOMEM;
-	orig = opts;
+	opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	while ((p = strsep(&opts, ":")) != NULL) {
-		int token;
+	switch (opt) {
+	case Opt_degraded:
+		btrfs_set_opt(ctx->mount_opt, DEGRADED);
+		break;
+	case Opt_subvol_empty:
+		/*
+		 * This exists because we used to allow it on accident, so we're
+		 * keeping it to maintain ABI.  See 37becec95ac3 ("Btrfs: allow
+		 * empty subvol= again").
+		 */
+		break;
+	case Opt_subvol:
+		kfree(ctx->subvol_name);
+		ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+		if (!ctx->subvol_name)
+			return -ENOMEM;
+		break;
+	case Opt_subvolid:
+		ctx->subvol_objectid = result.uint_64;
 
-		if (!*p)
-			continue;
-		token = match_token(p, rescue_tokens, args);
-		switch (token){
-		case Opt_usebackuproot:
-			btrfs_info(info,
-				   "trying to use backup root at mount time");
-			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
-			break;
-		case Opt_nologreplay:
-			btrfs_set_and_info(info, NOLOGREPLAY,
-					   "disabling log replay at mount time");
-			break;
-		case Opt_ignorebadroots:
-			btrfs_set_and_info(info, IGNOREBADROOTS,
-					   "ignoring bad roots");
-			break;
-		case Opt_ignoredatacsums:
-			btrfs_set_and_info(info, IGNOREDATACSUMS,
-					   "ignoring data csums");
-			break;
-		case Opt_rescue_all:
-			btrfs_info(info, "enabling all of the rescue options");
-			btrfs_set_and_info(info, IGNOREDATACSUMS,
-					   "ignoring data csums");
-			btrfs_set_and_info(info, IGNOREBADROOTS,
-					   "ignoring bad roots");
-			btrfs_set_and_info(info, NOLOGREPLAY,
-					   "disabling log replay at mount time");
-			break;
-		case Opt_err:
-			btrfs_info(info, "unrecognized rescue option '%s'", p);
-			ret = -EINVAL;
-			goto out;
-		default:
-			break;
-		}
+		/* subvolid=0 means give me the original fs_tree. */
+		if (!ctx->subvol_objectid)
+			ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+		break;
+	case Opt_device: {
+		struct btrfs_device *device;
+		blk_mode_t mode = btrfs_open_mode(fc);
 
+		mutex_lock(&uuid_mutex);
+		device = btrfs_scan_one_device(param->string, mode, false);
+		mutex_unlock(&uuid_mutex);
+		if (IS_ERR(device))
+			return PTR_ERR(device);
+		break;
 	}
-out:
-	kfree(orig);
-	return ret;
-}
-
-/*
- * Regular mount options parser.  Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
-			unsigned long new_flags)
-{
-	substring_t args[MAX_OPT_ARGS];
-	char *p, *num;
-	int intarg;
-	int ret = 0;
-	char *compress_type;
-	bool compress_force = false;
-	enum btrfs_compression_type saved_compress_type;
-	int saved_compress_level;
-	bool saved_compress_force;
-	int no_compress = 0;
-	const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
-	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
-		btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
-	else if (btrfs_free_space_cache_v1_active(info)) {
-		if (btrfs_is_zoned(info)) {
-			btrfs_info(info,
-			"zoned: clearing existing space cache");
-			btrfs_set_super_cache_generation(info->super_copy, 0);
+	case Opt_datasum:
+		if (result.negated) {
+			btrfs_set_opt(ctx->mount_opt, NODATASUM);
 		} else {
-			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+			btrfs_clear_opt(ctx->mount_opt, NODATASUM);
 		}
-	}
-
-	/*
-	 * Even the options are empty, we still need to do extra check
-	 * against new flags
-	 */
-	if (!options)
-		goto check;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_degraded:
-			btrfs_info(info, "allowing degraded mounts");
-			btrfs_set_opt(info->mount_opt, DEGRADED);
-			break;
-		case Opt_subvol:
-		case Opt_subvol_empty:
-		case Opt_subvolid:
-		case Opt_device:
-			/*
-			 * These are parsed by btrfs_parse_subvol_options or
-			 * btrfs_parse_device_options and can be ignored here.
-			 */
-			break;
-		case Opt_nodatasum:
-			btrfs_set_and_info(info, NODATASUM,
-					   "setting nodatasum");
-			break;
-		case Opt_datasum:
-			if (btrfs_test_opt(info, NODATASUM)) {
-				if (btrfs_test_opt(info, NODATACOW))
-					btrfs_info(info,
-						   "setting datasum, datacow enabled");
-				else
-					btrfs_info(info, "setting datasum");
-			}
-			btrfs_clear_opt(info->mount_opt, NODATACOW);
-			btrfs_clear_opt(info->mount_opt, NODATASUM);
-			break;
-		case Opt_nodatacow:
-			if (!btrfs_test_opt(info, NODATACOW)) {
-				if (!btrfs_test_opt(info, COMPRESS) ||
-				    !btrfs_test_opt(info, FORCE_COMPRESS)) {
-					btrfs_info(info,
-						   "setting nodatacow, compression disabled");
-				} else {
-					btrfs_info(info, "setting nodatacow");
-				}
-			}
-			btrfs_clear_opt(info->mount_opt, COMPRESS);
-			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
-			btrfs_set_opt(info->mount_opt, NODATACOW);
-			btrfs_set_opt(info->mount_opt, NODATASUM);
-			break;
-		case Opt_datacow:
-			btrfs_clear_and_info(info, NODATACOW,
-					     "setting datacow");
-			break;
-		case Opt_compress_force:
-		case Opt_compress_force_type:
-			compress_force = true;
-			fallthrough;
-		case Opt_compress:
-		case Opt_compress_type:
-			saved_compress_type = btrfs_test_opt(info,
-							     COMPRESS) ?
-				info->compress_type : BTRFS_COMPRESS_NONE;
-			saved_compress_force =
-				btrfs_test_opt(info, FORCE_COMPRESS);
-			saved_compress_level = info->compress_level;
-			if (token == Opt_compress ||
-			    token == Opt_compress_force ||
-			    strncmp(args[0].from, "zlib", 4) == 0) {
-				compress_type = "zlib";
-
-				info->compress_type = BTRFS_COMPRESS_ZLIB;
-				info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
-				/*
-				 * args[0] contains uninitialized data since
-				 * for these tokens we don't expect any
-				 * parameter.
-				 */
-				if (token != Opt_compress &&
-				    token != Opt_compress_force)
-					info->compress_level =
-					  btrfs_compress_str2level(
-							BTRFS_COMPRESS_ZLIB,
-							args[0].from + 4);
-				btrfs_set_opt(info->mount_opt, COMPRESS);
-				btrfs_clear_opt(info->mount_opt, NODATACOW);
-				btrfs_clear_opt(info->mount_opt, NODATASUM);
-				no_compress = 0;
-			} else if (strncmp(args[0].from, "lzo", 3) == 0) {
-				compress_type = "lzo";
-				info->compress_type = BTRFS_COMPRESS_LZO;
-				info->compress_level = 0;
-				btrfs_set_opt(info->mount_opt, COMPRESS);
-				btrfs_clear_opt(info->mount_opt, NODATACOW);
-				btrfs_clear_opt(info->mount_opt, NODATASUM);
-				btrfs_set_fs_incompat(info, COMPRESS_LZO);
-				no_compress = 0;
-			} else if (strncmp(args[0].from, "zstd", 4) == 0) {
-				compress_type = "zstd";
-				info->compress_type = BTRFS_COMPRESS_ZSTD;
-				info->compress_level =
-					btrfs_compress_str2level(
-							 BTRFS_COMPRESS_ZSTD,
-							 args[0].from + 4);
-				btrfs_set_opt(info->mount_opt, COMPRESS);
-				btrfs_clear_opt(info->mount_opt, NODATACOW);
-				btrfs_clear_opt(info->mount_opt, NODATASUM);
-				btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
-				no_compress = 0;
-			} else if (strncmp(args[0].from, "no", 2) == 0) {
-				compress_type = "no";
-				info->compress_level = 0;
-				info->compress_type = 0;
-				btrfs_clear_opt(info->mount_opt, COMPRESS);
-				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
-				compress_force = false;
-				no_compress++;
-			} else {
-				btrfs_err(info, "unrecognized compression value %s",
-					  args[0].from);
-				ret = -EINVAL;
-				goto out;
-			}
-
-			if (compress_force) {
-				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
-			} else {
-				/*
-				 * If we remount from compress-force=xxx to
-				 * compress=xxx, we need clear FORCE_COMPRESS
-				 * flag, otherwise, there is no way for users
-				 * to disable forcible compression separately.
-				 */
-				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
-			}
-			if (no_compress == 1) {
-				btrfs_info(info, "use no compression");
-			} else if ((info->compress_type != saved_compress_type) ||
-				   (compress_force != saved_compress_force) ||
-				   (info->compress_level != saved_compress_level)) {
-				btrfs_info(info, "%s %s compression, level %d",
-					   (compress_force) ? "force" : "use",
-					   compress_type, info->compress_level);
-			}
-			compress_force = false;
-			break;
-		case Opt_ssd:
-			btrfs_set_and_info(info, SSD,
-					   "enabling ssd optimizations");
-			btrfs_clear_opt(info->mount_opt, NOSSD);
-			break;
-		case Opt_ssd_spread:
-			btrfs_set_and_info(info, SSD,
-					   "enabling ssd optimizations");
-			btrfs_set_and_info(info, SSD_SPREAD,
-					   "using spread ssd allocation scheme");
-			btrfs_clear_opt(info->mount_opt, NOSSD);
-			break;
-		case Opt_nossd:
-			btrfs_set_opt(info->mount_opt, NOSSD);
-			btrfs_clear_and_info(info, SSD,
-					     "not using ssd optimizations");
-			fallthrough;
-		case Opt_nossd_spread:
-			btrfs_clear_and_info(info, SSD_SPREAD,
-					     "not using spread ssd allocation scheme");
-			break;
-		case Opt_barrier:
-			btrfs_clear_and_info(info, NOBARRIER,
-					     "turning on barriers");
-			break;
-		case Opt_nobarrier:
-			btrfs_set_and_info(info, NOBARRIER,
-					   "turning off barriers");
-			break;
-		case Opt_thread_pool:
-			ret = match_int(&args[0], &intarg);
-			if (ret) {
-				btrfs_err(info, "unrecognized thread_pool value %s",
-					  args[0].from);
-				goto out;
-			} else if (intarg == 0) {
-				btrfs_err(info, "invalid value 0 for thread_pool");
-				ret = -EINVAL;
-				goto out;
-			}
-			info->thread_pool_size = intarg;
-			break;
-		case Opt_max_inline:
-			num = match_strdup(&args[0]);
-			if (num) {
-				info->max_inline = memparse(num, NULL);
-				kfree(num);
-
-				if (info->max_inline) {
-					info->max_inline = min_t(u64,
-						info->max_inline,
-						info->sectorsize);
-				}
-				btrfs_info(info, "max_inline at %llu",
-					   info->max_inline);
-			} else {
-				ret = -ENOMEM;
-				goto out;
-			}
-			break;
-		case Opt_acl:
+		break;
+	case Opt_datacow:
+		if (result.negated) {
+			btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+			btrfs_set_opt(ctx->mount_opt, NODATACOW);
+			btrfs_set_opt(ctx->mount_opt, NODATASUM);
+		} else {
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+		}
+		break;
+	case Opt_compress_force:
+	case Opt_compress_force_type:
+		btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+		fallthrough;
+	case Opt_compress:
+	case Opt_compress_type:
+		if (opt == Opt_compress || opt == Opt_compress_force) {
+			ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+			ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+			btrfs_set_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+			btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+		} else if (strncmp(param->string, "zlib", 4) == 0) {
+			ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+			ctx->compress_level =
+				btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+							 param->string + 4);
+			btrfs_set_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+			btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+		} else if (strncmp(param->string, "lzo", 3) == 0) {
+			ctx->compress_type = BTRFS_COMPRESS_LZO;
+			ctx->compress_level = 0;
+			btrfs_set_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+			btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+		} else if (strncmp(param->string, "zstd", 4) == 0) {
+			ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+			ctx->compress_level =
+				btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+							 param->string + 4);
+			btrfs_set_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+			btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+		} else if (strncmp(param->string, "no", 2) == 0) {
+			ctx->compress_level = 0;
+			ctx->compress_type = 0;
+			btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+			btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+		} else {
+			btrfs_err(NULL, "unrecognized compression value %s",
+				  param->string);
+			return -EINVAL;
+		}
+		break;
+	case Opt_ssd:
+		if (result.negated) {
+			btrfs_set_opt(ctx->mount_opt, NOSSD);
+			btrfs_clear_opt(ctx->mount_opt, SSD);
+			btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+		} else {
+			btrfs_set_opt(ctx->mount_opt, SSD);
+			btrfs_clear_opt(ctx->mount_opt, NOSSD);
+		}
+		break;
+	case Opt_ssd_spread:
+		if (result.negated) {
+			btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+		} else {
+			btrfs_set_opt(ctx->mount_opt, SSD);
+			btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+			btrfs_clear_opt(ctx->mount_opt, NOSSD);
+		}
+		break;
+	case Opt_barrier:
+		if (result.negated)
+			btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+		else
+			btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+		break;
+	case Opt_thread_pool:
+		if (result.uint_32 == 0) {
+			btrfs_err(NULL, "invalid value 0 for thread_pool");
+			return -EINVAL;
+		}
+		ctx->thread_pool_size = result.uint_32;
+		break;
+	case Opt_max_inline:
+		ctx->max_inline = memparse(param->string, NULL);
+		break;
+	case Opt_acl:
+		if (result.negated) {
+			fc->sb_flags &= ~SB_POSIXACL;
+		} else {
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-			info->sb->s_flags |= SB_POSIXACL;
-			break;
+			fc->sb_flags |= SB_POSIXACL;
 #else
-			btrfs_err(info, "support for ACL not compiled in!");
-			ret = -EINVAL;
-			goto out;
+			btrfs_err(NULL, "support for ACL not compiled in");
+			return -EINVAL;
 #endif
-		case Opt_noacl:
-			info->sb->s_flags &= ~SB_POSIXACL;
-			break;
-		case Opt_notreelog:
-			btrfs_set_and_info(info, NOTREELOG,
-					   "disabling tree log");
-			break;
-		case Opt_treelog:
-			btrfs_clear_and_info(info, NOTREELOG,
-					     "enabling tree log");
-			break;
-		case Opt_norecovery:
-		case Opt_nologreplay:
-			btrfs_warn(info,
+		}
+		/*
+		 * VFS limits the ability to toggle ACL on and off via remount,
+		 * despite every file system allowing this.  This seems to be
+		 * an oversight since we all do, but it'll fail if we're
+		 * remounting.  So don't set the mask here, we'll check it in
+		 * btrfs_reconfigure and do the toggling ourselves.
+		 */
+		if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+			fc->sb_flags_mask |= SB_POSIXACL;
+		break;
+	case Opt_treelog:
+		if (result.negated)
+			btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+		else
+			btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+		break;
+	case Opt_nologreplay:
+		btrfs_warn(NULL,
 		"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
-			btrfs_set_and_info(info, NOLOGREPLAY,
-					   "disabling log replay at mount time");
-			break;
-		case Opt_flushoncommit:
-			btrfs_set_and_info(info, FLUSHONCOMMIT,
-					   "turning on flush-on-commit");
-			break;
-		case Opt_noflushoncommit:
-			btrfs_clear_and_info(info, FLUSHONCOMMIT,
-					     "turning off flush-on-commit");
-			break;
-		case Opt_ratio:
-			ret = match_int(&args[0], &intarg);
-			if (ret) {
-				btrfs_err(info, "unrecognized metadata_ratio value %s",
-					  args[0].from);
-				goto out;
-			}
-			info->metadata_ratio = intarg;
-			btrfs_info(info, "metadata ratio %u",
-				   info->metadata_ratio);
-			break;
-		case Opt_discard:
-		case Opt_discard_mode:
-			if (token == Opt_discard ||
-			    strcmp(args[0].from, "sync") == 0) {
-				btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
-				btrfs_set_and_info(info, DISCARD_SYNC,
-						   "turning on sync discard");
-			} else if (strcmp(args[0].from, "async") == 0) {
-				btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
-				btrfs_set_and_info(info, DISCARD_ASYNC,
-						   "turning on async discard");
-			} else {
-				btrfs_err(info, "unrecognized discard mode value %s",
-					  args[0].from);
-				ret = -EINVAL;
-				goto out;
-			}
-			btrfs_clear_opt(info->mount_opt, NODISCARD);
-			break;
-		case Opt_nodiscard:
-			btrfs_clear_and_info(info, DISCARD_SYNC,
-					     "turning off discard");
-			btrfs_clear_and_info(info, DISCARD_ASYNC,
-					     "turning off async discard");
-			btrfs_set_opt(info->mount_opt, NODISCARD);
-			break;
-		case Opt_space_cache:
-		case Opt_space_cache_version:
-			/*
-			 * We already set FREE_SPACE_TREE above because we have
-			 * compat_ro(FREE_SPACE_TREE) set, and we aren't going
-			 * to allow v1 to be set for extent tree v2, simply
-			 * ignore this setting if we're extent tree v2.
-			 */
-			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
-				break;
-			if (token == Opt_space_cache ||
-			    strcmp(args[0].from, "v1") == 0) {
-				btrfs_clear_opt(info->mount_opt,
-						FREE_SPACE_TREE);
-				btrfs_set_and_info(info, SPACE_CACHE,
-					   "enabling disk space caching");
-			} else if (strcmp(args[0].from, "v2") == 0) {
-				btrfs_clear_opt(info->mount_opt,
-						SPACE_CACHE);
-				btrfs_set_and_info(info, FREE_SPACE_TREE,
-						   "enabling free space tree");
-			} else {
-				btrfs_err(info, "unrecognized space_cache value %s",
-					  args[0].from);
-				ret = -EINVAL;
-				goto out;
-			}
-			break;
-		case Opt_rescan_uuid_tree:
-			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
-			break;
-		case Opt_no_space_cache:
-			/*
-			 * We cannot operate without the free space tree with
-			 * extent tree v2, ignore this option.
-			 */
-			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
-				break;
-			if (btrfs_test_opt(info, SPACE_CACHE)) {
-				btrfs_clear_and_info(info, SPACE_CACHE,
-					     "disabling disk space caching");
-			}
-			if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
-				btrfs_clear_and_info(info, FREE_SPACE_TREE,
-					     "disabling free space tree");
-			}
-			break;
-		case Opt_inode_cache:
-		case Opt_noinode_cache:
-			btrfs_warn(info,
-	"the 'inode_cache' option is deprecated and has no effect since 5.11");
+		btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+		break;
+	case Opt_flushoncommit:
+		if (result.negated)
+			btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+		else
+			btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+		break;
+	case Opt_ratio:
+		ctx->metadata_ratio = result.uint_32;
+		break;
+	case Opt_discard:
+		if (result.negated) {
+			btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+			btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+			btrfs_set_opt(ctx->mount_opt, NODISCARD);
+		} else {
+			btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+			btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+		}
+		break;
+	case Opt_discard_mode:
+		switch (result.uint_32) {
+		case Opt_discard_sync:
+			btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+			btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
 			break;
-		case Opt_clear_cache:
-			/*
-			 * We cannot clear the free space tree with extent tree
-			 * v2, ignore this option.
-			 */
-			if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
-				break;
-			btrfs_set_and_info(info, CLEAR_CACHE,
-					   "force clearing of disk cache");
+		case Opt_discard_async:
+			btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+			btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
 			break;
-		case Opt_user_subvol_rm_allowed:
-			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+		default:
+			btrfs_err(NULL, "unrecognized discard mode value %s",
+				  param->key);
+			return -EINVAL;
+		}
+		btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+		break;
+	case Opt_space_cache:
+		if (result.negated) {
+			btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+			btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+			btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+		} else {
+			btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+			btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+		}
+		break;
+	case Opt_space_cache_version:
+		switch (result.uint_32) {
+		case Opt_space_cache_v1:
+			btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+			btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
 			break;
-		case Opt_enospc_debug:
-			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+		case Opt_space_cache_v2:
+			btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+			btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
 			break;
-		case Opt_noenospc_debug:
-			btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+		default:
+			btrfs_err(NULL, "unrecognized space_cache value %s",
+				  param->key);
+			return -EINVAL;
+		}
+		break;
+	case Opt_rescan_uuid_tree:
+		btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+		break;
+	case Opt_clear_cache:
+		btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+		break;
+	case Opt_user_subvol_rm_allowed:
+		btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+		break;
+	case Opt_enospc_debug:
+		if (result.negated)
+			btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+		else
+			btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+		break;
+	case Opt_defrag:
+		if (result.negated)
+			btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+		else
+			btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+		break;
+	case Opt_usebackuproot:
+		btrfs_warn(NULL,
+			   "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+		btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+		/* If we're loading the backup roots we can't trust the space cache. */
+		btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+		break;
+	case Opt_skip_balance:
+		btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+		break;
+	case Opt_fatal_errors:
+		switch (result.uint_32) {
+		case Opt_fatal_errors_panic:
+			btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
 			break;
-		case Opt_defrag:
-			btrfs_set_and_info(info, AUTO_DEFRAG,
-					   "enabling auto defrag");
+		case Opt_fatal_errors_bug:
+			btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
 			break;
-		case Opt_nodefrag:
-			btrfs_clear_and_info(info, AUTO_DEFRAG,
-					     "disabling auto defrag");
+		default:
+			btrfs_err(NULL, "unrecognized fatal_errors value %s",
+				  param->key);
+			return -EINVAL;
+		}
+		break;
+	case Opt_commit_interval:
+		ctx->commit_interval = result.uint_32;
+		if (ctx->commit_interval == 0)
+			ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+		break;
+	case Opt_rescue:
+		switch (result.uint_32) {
+		case Opt_rescue_usebackuproot:
+			btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
 			break;
-		case Opt_recovery:
-		case Opt_usebackuproot:
-			btrfs_warn(info,
-			"'%s' is deprecated, use 'rescue=usebackuproot' instead",
-				   token == Opt_recovery ? "recovery" :
-				   "usebackuproot");
-			btrfs_info(info,
-				   "trying to use backup root at mount time");
-			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+		case Opt_rescue_nologreplay:
+			btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
 			break;
-		case Opt_skip_balance:
-			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+		case Opt_rescue_ignorebadroots:
+			btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
 			break;
-		case Opt_fatal_errors:
-			if (strcmp(args[0].from, "panic") == 0) {
-				btrfs_set_opt(info->mount_opt,
-					      PANIC_ON_FATAL_ERROR);
-			} else if (strcmp(args[0].from, "bug") == 0) {
-				btrfs_clear_opt(info->mount_opt,
-					      PANIC_ON_FATAL_ERROR);
-			} else {
-				btrfs_err(info, "unrecognized fatal_errors value %s",
-					  args[0].from);
-				ret = -EINVAL;
-				goto out;
-			}
+		case Opt_rescue_ignoredatacsums:
+			btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
 			break;
-		case Opt_commit_interval:
-			intarg = 0;
-			ret = match_int(&args[0], &intarg);
-			if (ret) {
-				btrfs_err(info, "unrecognized commit_interval value %s",
-					  args[0].from);
-				ret = -EINVAL;
-				goto out;
-			}
-			if (intarg == 0) {
-				btrfs_info(info,
-					   "using default commit interval %us",
-					   BTRFS_DEFAULT_COMMIT_INTERVAL);
-				intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
-			} else if (intarg > 300) {
-				btrfs_warn(info, "excessive commit interval %d",
-					   intarg);
-			}
-			info->commit_interval = intarg;
-			break;
-		case Opt_rescue:
-			ret = parse_rescue_options(info, args[0].from);
-			if (ret < 0) {
-				btrfs_err(info, "unrecognized rescue value %s",
-					  args[0].from);
-				goto out;
-			}
+		case Opt_rescue_parameter_all:
+			btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+			btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+			btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
 			break;
+		default:
+			btrfs_info(NULL, "unrecognized rescue option '%s'",
+				   param->key);
+			return -EINVAL;
+		}
+		break;
 #ifdef CONFIG_BTRFS_DEBUG
-		case Opt_fragment_all:
-			btrfs_info(info, "fragmenting all space");
-			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
-			btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+	case Opt_fragment:
+		switch (result.uint_32) {
+		case Opt_fragment_parameter_all:
+			btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+			btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
 			break;
-		case Opt_fragment_metadata:
-			btrfs_info(info, "fragmenting metadata");
-			btrfs_set_opt(info->mount_opt,
-				      FRAGMENT_METADATA);
+		case Opt_fragment_parameter_metadata:
+			btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
 			break;
-		case Opt_fragment_data:
-			btrfs_info(info, "fragmenting data");
-			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+		case Opt_fragment_parameter_data:
+			btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
 			break;
+		default:
+			btrfs_info(NULL, "unrecognized fragment option '%s'",
+				   param->key);
+			return -EINVAL;
+		}
+		break;
 #endif
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
-		case Opt_ref_verify:
-			btrfs_info(info, "doing ref verification");
-			btrfs_set_opt(info->mount_opt, REF_VERIFY);
-			break;
+	case Opt_ref_verify:
+		btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+		break;
 #endif
-		case Opt_err:
-			btrfs_err(info, "unrecognized mount option '%s'", p);
-			ret = -EINVAL;
-			goto out;
-		default:
-			break;
-		}
+	default:
+		btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+		return -EINVAL;
 	}
-check:
-	/* We're read-only, don't have to check. */
-	if (new_flags & SB_RDONLY)
-		goto out;
 
-	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
-	    check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
-	    check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
-		ret = -EINVAL;
-out:
-	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
-	    !btrfs_test_opt(info, FREE_SPACE_TREE) &&
-	    !btrfs_test_opt(info, CLEAR_CACHE)) {
-		btrfs_err(info, "cannot disable free space tree");
-		ret = -EINVAL;
-	}
-	if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
-	     !btrfs_test_opt(info, FREE_SPACE_TREE)) {
-		btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
-		ret = -EINVAL;
-	}
-	if (!ret)
-		ret = btrfs_check_mountopts_zoned(info);
-	if (!ret && !remounting) {
-		if (btrfs_test_opt(info, SPACE_CACHE))
-			btrfs_info(info, "disk space caching is enabled");
-		if (btrfs_test_opt(info, FREE_SPACE_TREE))
-			btrfs_info(info, "using free space tree");
-	}
-	return ret;
+	return 0;
 }
 
 /*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
  */
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
 {
-	substring_t args[MAX_OPT_ARGS];
-	char *device_name, *opts, *orig, *p;
-	struct btrfs_device *device = NULL;
-	int error = 0;
+	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+	btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
 
-	lockdep_assert_held(&uuid_mutex);
+static bool check_ro_option(struct btrfs_fs_info *fs_info,
+			    unsigned long mount_opt, unsigned long opt,
+			    const char *opt_name)
+{
+	if (mount_opt & opt) {
+		btrfs_err(fs_info, "%s must be used with ro mount option",
+			  opt_name);
+		return true;
+	}
+	return false;
+}
 
-	if (!options)
-		return 0;
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+			 unsigned long flags)
+{
+	bool ret = true;
 
-	/*
-	 * strsep changes the string, duplicate it because btrfs_parse_options
-	 * gets called later
-	 */
-	opts = kstrdup(options, GFP_KERNEL);
-	if (!opts)
-		return -ENOMEM;
-	orig = opts;
+	if (!(flags & SB_RDONLY) &&
+	    (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+	     check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+	     check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+		ret = false;
 
-	while ((p = strsep(&opts, ",")) != NULL) {
-		int token;
+	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+	    !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+	    !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+		btrfs_err(info, "cannot disable free-space-tree");
+		ret = false;
+	}
+	if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+	     !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+		btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
+		ret = false;
+	}
 
-		if (!*p)
-			continue;
+	if (btrfs_check_mountopts_zoned(info, mount_opt))
+		ret = false;
 
-		token = match_token(p, tokens, args);
-		if (token == Opt_device) {
-			device_name = match_strdup(&args[0]);
-			if (!device_name) {
-				error = -ENOMEM;
-				goto out;
-			}
-			device = btrfs_scan_one_device(device_name, flags, false);
-			kfree(device_name);
-			if (IS_ERR(device)) {
-				error = PTR_ERR(device);
-				goto out;
-			}
-		}
+	if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+		if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+			btrfs_info(info, "disk space caching is enabled");
+		if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+			btrfs_info(info, "using free-space-tree");
 	}
 
-out:
-	kfree(orig);
-	return error;
+	return ret;
 }
 
 /*
- * Parse mount options that are related to subvolume id
+ * This is subtle, we only call this during open_ctree().  We need to pre-load
+ * the mount options with the on-disk settings.  Before the new mount API took
+ * effect we would do this on mount and remount.  With the new mount API we'll
+ * only do this on the initial mount.
  *
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options.  If you mounted with special
+ * options to disable these features and then remounted we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on re-mount is not needed.
  */
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
-		u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
 {
-	substring_t args[MAX_OPT_ARGS];
-	char *opts, *orig, *p;
-	int error = 0;
-	u64 subvolid;
-
-	if (!options)
-		return 0;
+	if (fs_info->sectorsize < PAGE_SIZE) {
+		btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+		if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+			btrfs_info(fs_info,
+				   "forcing free space tree for sector size %u with page size %lu",
+				   fs_info->sectorsize, PAGE_SIZE);
+			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+		}
+	}
 
 	/*
-	 * strsep changes the string, duplicate it because
-	 * btrfs_parse_device_options gets called later
+	 * At this point our mount options are populated, so we only mess with
+	 * these settings if we don't have any settings already.
 	 */
-	opts = kstrdup(options, GFP_KERNEL);
-	if (!opts)
-		return -ENOMEM;
-	orig = opts;
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		return;
 
-	while ((p = strsep(&opts, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
+	if (btrfs_is_zoned(fs_info) &&
+	    btrfs_free_space_cache_v1_active(fs_info)) {
+		btrfs_info(fs_info, "zoned: clearing existing space cache");
+		btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+		return;
+	}
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_subvol:
-			kfree(*subvol_name);
-			*subvol_name = match_strdup(&args[0]);
-			if (!*subvol_name) {
-				error = -ENOMEM;
-				goto out;
-			}
-			break;
-		case Opt_subvolid:
-			error = match_u64(&args[0], &subvolid);
-			if (error)
-				goto out;
+	if (btrfs_test_opt(fs_info, SPACE_CACHE))
+		return;
 
-			/* we want the original fs_tree */
-			if (subvolid == 0)
-				subvolid = BTRFS_FS_TREE_OBJECTID;
+	if (btrfs_test_opt(fs_info, NOSPACECACHE))
+		return;
 
-			*subvol_objectid = subvolid;
-			break;
-		default:
-			break;
-		}
-	}
+	/*
+	 * At this point we don't have explicit options set by the user, set
+	 * them ourselves based on the state of the file system.
+	 */
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+	else if (btrfs_free_space_cache_v1_active(fs_info))
+		btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
 
-out:
-	kfree(orig);
-	return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+	if (!btrfs_test_opt(fs_info, NOSSD) &&
+	    !fs_info->fs_devices->rotating)
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+
+	/*
+	 * For devices supporting discard turn on discard=async automatically,
+	 * unless it's already set or disabled. This could be turned off by
+	 * nodiscard for the same mount.
+	 *
+	 * The zoned mode piggy backs on the discard functionality for
+	 * resetting a zone. There is no reason to delay the zone reset as it is
+	 * fast enough. So, do not enable async discard for zoned mode.
+	 */
+	if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+	      btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+	      btrfs_test_opt(fs_info, NODISCARD)) &&
+	    fs_info->fs_devices->discardable &&
+	    !btrfs_is_zoned(fs_info))
+		btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
 }
 
 char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1105,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb,
 #endif
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-	sb->s_flags |= SB_POSIXACL;
-#endif
-	sb->s_flags |= SB_I_VERSION;
 	sb->s_iflags |= SB_I_CGROUPWB;
 
 	err = super_setup_bdi(sb);
@@ -1291,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	return 0;
 }
 
-static int btrfs_test_super(struct super_block *s, void *data)
-{
-	struct btrfs_fs_info *p = data;
-	struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
-	return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
-	int err = set_anon_super(s, data);
-	if (!err)
-		s->s_fs_info = data;
-	return err;
-}
-
 /*
  * subvolumes are identified by ino 256
  */
@@ -1382,202 +1186,6 @@ out:
 	return root;
 }
 
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- *       for multiple device setup.  Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
-		int flags, const char *device_name, void *data)
-{
-	struct block_device *bdev = NULL;
-	struct super_block *s;
-	struct btrfs_device *device = NULL;
-	struct btrfs_fs_devices *fs_devices = NULL;
-	struct btrfs_fs_info *fs_info = NULL;
-	void *new_sec_opts = NULL;
-	blk_mode_t mode = sb_open_mode(flags);
-	int error = 0;
-
-	if (data) {
-		error = security_sb_eat_lsm_opts(data, &new_sec_opts);
-		if (error)
-			return ERR_PTR(error);
-	}
-
-	/* No support for restricting writes to btrfs devices yet... */
-	mode &= ~BLK_OPEN_RESTRICT_WRITES;
-	/*
-	 * Setup a dummy root and fs_info for test/set super.  This is because
-	 * we don't actually fill this stuff out until open_ctree, but we need
-	 * then open_ctree will properly initialize the file system specific
-	 * settings later.  btrfs_init_fs_info initializes the static elements
-	 * of the fs_info (locks and such) to make cleanup easier if we find a
-	 * superblock with our given fs_devices later on at sget() time.
-	 */
-	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
-	if (!fs_info) {
-		error = -ENOMEM;
-		goto error_sec_opts;
-	}
-	btrfs_init_fs_info(fs_info);
-
-	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
-	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
-	if (!fs_info->super_copy || !fs_info->super_for_commit) {
-		error = -ENOMEM;
-		goto error_fs_info;
-	}
-
-	mutex_lock(&uuid_mutex);
-	error = btrfs_parse_device_options(data, mode);
-	if (error) {
-		mutex_unlock(&uuid_mutex);
-		goto error_fs_info;
-	}
-
-	/*
-	 * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
-	 * either a valid device or an error.
-	 */
-	device = btrfs_scan_one_device(device_name, mode, true);
-	ASSERT(device != NULL);
-	if (IS_ERR(device)) {
-		mutex_unlock(&uuid_mutex);
-		error = PTR_ERR(device);
-		goto error_fs_info;
-	}
-
-	fs_devices = device->fs_devices;
-	fs_info->fs_devices = fs_devices;
-
-	error = btrfs_open_devices(fs_devices, mode, fs_type);
-	mutex_unlock(&uuid_mutex);
-	if (error)
-		goto error_fs_info;
-
-	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
-		error = -EACCES;
-		goto error_close_devices;
-	}
-
-	bdev = fs_devices->latest_dev->bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
-		 fs_info);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto error_close_devices;
-	}
-
-	if (s->s_root) {
-		btrfs_close_devices(fs_devices);
-		btrfs_free_fs_info(fs_info);
-		if ((flags ^ s->s_flags) & SB_RDONLY)
-			error = -EBUSY;
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
-					s->s_id);
-		btrfs_sb(s)->bdev_holder = fs_type;
-		error = btrfs_fill_super(s, fs_devices, data);
-	}
-	if (!error)
-		error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
-	security_free_mnt_opts(&new_sec_opts);
-	if (error) {
-		deactivate_locked_super(s);
-		return ERR_PTR(error);
-	}
-
-	return dget(s->s_root);
-
-error_close_devices:
-	btrfs_close_devices(fs_devices);
-error_fs_info:
-	btrfs_free_fs_info(fs_info);
-error_sec_opts:
-	security_free_mnt_opts(&new_sec_opts);
-	return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/).  This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- *   1. Parse subvol id related options for later use in mount_subvol().
- *
- *   2. Mount device's root (/) by calling vfs_kern_mount().
- *
- *      NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- *      first place. In order to avoid calling btrfs_mount() again, we use
- *      different file_system_type which is not registered to VFS by
- *      register_filesystem() (btrfs_root_fs_type). As a result,
- *      btrfs_mount_root() is called. The return value will be used by
- *      mount_subtree() in mount_subvol().
- *
- *   3. Call mount_subvol() to get the dentry of subvolume. Since there is
- *      "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
-		const char *device_name, void *data)
-{
-	struct vfsmount *mnt_root;
-	struct dentry *root;
-	char *subvol_name = NULL;
-	u64 subvol_objectid = 0;
-	int error = 0;
-
-	error = btrfs_parse_subvol_options(data, &subvol_name,
-					&subvol_objectid);
-	if (error) {
-		kfree(subvol_name);
-		return ERR_PTR(error);
-	}
-
-	/* mount device's root (/) */
-	mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
-	if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
-		if (flags & SB_RDONLY) {
-			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
-				flags & ~SB_RDONLY, device_name, data);
-		} else {
-			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
-				flags | SB_RDONLY, device_name, data);
-			if (IS_ERR(mnt_root)) {
-				root = ERR_CAST(mnt_root);
-				kfree(subvol_name);
-				goto out;
-			}
-
-			down_write(&mnt_root->mnt_sb->s_umount);
-			error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
-			up_write(&mnt_root->mnt_sb->s_umount);
-			if (error < 0) {
-				root = ERR_PTR(error);
-				mntput(mnt_root);
-				kfree(subvol_name);
-				goto out;
-			}
-		}
-	}
-	if (IS_ERR(mnt_root)) {
-		root = ERR_CAST(mnt_root);
-		kfree(subvol_name);
-		goto out;
-	}
-
-	/* mount_subvol() will free subvol_name and mnt_root */
-	root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
-	return root;
-}
-
 static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 				     u32 new_pool_size, u32 old_pool_size)
 {
@@ -1640,202 +1248,274 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
 		btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
 }
 
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	unsigned old_flags = sb->s_flags;
-	unsigned long old_opts = fs_info->mount_opt;
-	unsigned long old_compress_type = fs_info->compress_type;
-	u64 old_max_inline = fs_info->max_inline;
-	u32 old_thread_pool_size = fs_info->thread_pool_size;
-	u32 old_metadata_ratio = fs_info->metadata_ratio;
 	int ret;
 
-	sync_filesystem(sb);
-	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+	if (BTRFS_FS_ERROR(fs_info)) {
+		btrfs_err(fs_info,
+			  "remounting read-write after error is not allowed");
+		return -EINVAL;
+	}
 
-	if (data) {
-		void *new_sec_opts = NULL;
+	if (fs_info->fs_devices->rw_devices == 0)
+		return -EACCES;
 
-		ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
-		if (!ret)
-			ret = security_sb_remount(sb, new_sec_opts);
-		security_free_mnt_opts(&new_sec_opts);
-		if (ret)
-			goto restore;
+	if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+		btrfs_warn(fs_info,
+			   "too many missing devices, writable remount is not allowed");
+		return -EACCES;
+	}
+
+	if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+		btrfs_warn(fs_info,
+			   "mount required to replay tree-log, cannot remount read-write");
+		return -EINVAL;
 	}
 
-	ret = btrfs_parse_options(fs_info, data, *flags);
+	/*
+	 * NOTE: when remounting with a change that does writes, don't put it
+	 * anywhere above this point, as we are not sure to be safe to write
+	 * until we pass the above checks.
+	 */
+	ret = btrfs_start_pre_rw_mount(fs_info);
 	if (ret)
-		goto restore;
+		return ret;
 
-	ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
-	if (ret < 0)
-		goto restore;
+	btrfs_clear_sb_rdonly(fs_info->sb);
 
-	btrfs_remount_begin(fs_info, old_opts, *flags);
-	btrfs_resize_thread_pool(fs_info,
-		fs_info->thread_pool_size, old_thread_pool_size);
+	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
-	if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
-	    (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
-	    (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
-		btrfs_warn(fs_info,
-		"remount supports changing free space tree only from ro to rw");
-		/* Make sure free space cache options match the state on disk */
-		if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
-			btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
-		}
-		if (btrfs_free_space_cache_v1_active(fs_info)) {
-			btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
-			btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
-		}
-	}
+	/*
+	 * If we've gone from readonly -> read-write, we need to get our
+	 * sync/async discard lists in the right state.
+	 */
+	btrfs_discard_resume(fs_info);
 
-	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
-		goto out;
+	return 0;
+}
 
-	if (*flags & SB_RDONLY) {
-		/*
-		 * this also happens on 'umount -rf' or on shutdown, when
-		 * the filesystem is busy.
-		 */
-		cancel_work_sync(&fs_info->async_reclaim_work);
-		cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * This also happens on 'umount -rf' or on shutdown, when the
+	 * filesystem is busy.
+	 */
+	cancel_work_sync(&fs_info->async_reclaim_work);
+	cancel_work_sync(&fs_info->async_data_reclaim_work);
 
-		btrfs_discard_cleanup(fs_info);
+	btrfs_discard_cleanup(fs_info);
 
-		/* wait for the uuid_scan task to finish */
-		down(&fs_info->uuid_tree_rescan_sem);
-		/* avoid complains from lockdep et al. */
-		up(&fs_info->uuid_tree_rescan_sem);
+	/* Wait for the uuid_scan task to finish */
+	down(&fs_info->uuid_tree_rescan_sem);
+	/* Avoid complains from lockdep et al. */
+	up(&fs_info->uuid_tree_rescan_sem);
 
-		btrfs_set_sb_rdonly(sb);
+	btrfs_set_sb_rdonly(fs_info->sb);
 
-		/*
-		 * Setting SB_RDONLY will put the cleaner thread to
-		 * sleep at the next loop if it's already active.
-		 * If it's already asleep, we'll leave unused block
-		 * groups on disk until we're mounted read-write again
-		 * unless we clean them up here.
-		 */
-		btrfs_delete_unused_bgs(fs_info);
+	/*
+	 * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+	 * loop if it's already active.  If it's already asleep, we'll leave
+	 * unused block groups on disk until we're mounted read-write again
+	 * unless we clean them up here.
+	 */
+	btrfs_delete_unused_bgs(fs_info);
 
-		/*
-		 * The cleaner task could be already running before we set the
-		 * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
-		 * We must make sure that after we finish the remount, i.e. after
-		 * we call btrfs_commit_super(), the cleaner can no longer start
-		 * a transaction - either because it was dropping a dead root,
-		 * running delayed iputs or deleting an unused block group (the
-		 * cleaner picked a block group from the list of unused block
-		 * groups before we were able to in the previous call to
-		 * btrfs_delete_unused_bgs()).
-		 */
-		wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
-			    TASK_UNINTERRUPTIBLE);
+	/*
+	 * The cleaner task could be already running before we set the flag
+	 * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).  We must make
+	 * sure that after we finish the remount, i.e. after we call
+	 * btrfs_commit_super(), the cleaner can no longer start a transaction
+	 * - either because it was dropping a dead root, running delayed iputs
+	 *   or deleting an unused block group (the cleaner picked a block
+	 *   group from the list of unused block groups before we were able to
+	 *   in the previous call to btrfs_delete_unused_bgs()).
+	 */
+	wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
 
-		/*
-		 * We've set the superblock to RO mode, so we might have made
-		 * the cleaner task sleep without running all pending delayed
-		 * iputs. Go through all the delayed iputs here, so that if an
-		 * unmount happens without remounting RW we don't end up at
-		 * finishing close_ctree() with a non-empty list of delayed
-		 * iputs.
-		 */
-		btrfs_run_delayed_iputs(fs_info);
+	/*
+	 * We've set the superblock to RO mode, so we might have made the
+	 * cleaner task sleep without running all pending delayed iputs. Go
+	 * through all the delayed iputs here, so that if an unmount happens
+	 * without remounting RW we don't end up at finishing close_ctree()
+	 * with a non-empty list of delayed iputs.
+	 */
+	btrfs_run_delayed_iputs(fs_info);
 
-		btrfs_dev_replace_suspend_for_unmount(fs_info);
-		btrfs_scrub_cancel(fs_info);
-		btrfs_pause_balance(fs_info);
+	btrfs_dev_replace_suspend_for_unmount(fs_info);
+	btrfs_scrub_cancel(fs_info);
+	btrfs_pause_balance(fs_info);
 
-		/*
-		 * Pause the qgroup rescan worker if it is running. We don't want
-		 * it to be still running after we are in RO mode, as after that,
-		 * by the time we unmount, it might have left a transaction open,
-		 * so we would leak the transaction and/or crash.
-		 */
-		btrfs_qgroup_wait_for_completion(fs_info, false);
+	/*
+	 * Pause the qgroup rescan worker if it is running. We don't want it to
+	 * be still running after we are in RO mode, as after that, by the time
+	 * we unmount, it might have left a transaction open, so we would leak
+	 * the transaction and/or crash.
+	 */
+	btrfs_qgroup_wait_for_completion(fs_info, false);
 
-		ret = btrfs_commit_super(fs_info);
-		if (ret)
-			goto restore;
-	} else {
-		if (BTRFS_FS_ERROR(fs_info)) {
-			btrfs_err(fs_info,
-				"Remounting read-write after error is not allowed");
-			ret = -EINVAL;
-			goto restore;
-		}
-		if (fs_info->fs_devices->rw_devices == 0) {
-			ret = -EACCES;
-			goto restore;
-		}
+	return btrfs_commit_super(fs_info);
+}
 
-		if (!btrfs_check_rw_degradable(fs_info, NULL)) {
-			btrfs_warn(fs_info,
-		"too many missing devices, writable remount is not allowed");
-			ret = -EACCES;
-			goto restore;
-		}
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+	fs_info->max_inline = ctx->max_inline;
+	fs_info->commit_interval = ctx->commit_interval;
+	fs_info->metadata_ratio = ctx->metadata_ratio;
+	fs_info->thread_pool_size = ctx->thread_pool_size;
+	fs_info->mount_opt = ctx->mount_opt;
+	fs_info->compress_type = ctx->compress_type;
+	fs_info->compress_level = ctx->compress_level;
+}
 
-		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
-			btrfs_warn(fs_info,
-		"mount required to replay tree-log, cannot remount read-write");
-			ret = -EINVAL;
-			goto restore;
-		}
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+	ctx->max_inline = fs_info->max_inline;
+	ctx->commit_interval = fs_info->commit_interval;
+	ctx->metadata_ratio = fs_info->metadata_ratio;
+	ctx->thread_pool_size = fs_info->thread_pool_size;
+	ctx->mount_opt = fs_info->mount_opt;
+	ctx->compress_type = fs_info->compress_type;
+	ctx->compress_level = fs_info->compress_level;
+}
 
-		/*
-		 * NOTE: when remounting with a change that does writes, don't
-		 * put it anywhere above this point, as we are not sure to be
-		 * safe to write until we pass the above checks.
-		 */
-		ret = btrfs_start_pre_rw_mount(fs_info);
-		if (ret)
-			goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...)			\
+do {										\
+	if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) &&	\
+	    btrfs_raw_test_opt(fs_info->mount_opt, opt))			\
+		btrfs_info(fs_info, fmt, ##args);				\
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...)	\
+do {									\
+	if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) &&	\
+	    !btrfs_raw_test_opt(fs_info->mount_opt, opt))		\
+		btrfs_info(fs_info, fmt, ##args);			\
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+			       struct btrfs_fs_context *old)
+{
+	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+	btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+	btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+	btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+	btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+	btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+	btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+	btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+	btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+	btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+	btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+	btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+	btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+	btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+	btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+	btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+	btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+	btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+	btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+	btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+	btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+
+	btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+	btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+	btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+	btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+	btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+	btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+	btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+	btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+	btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+	/* Did the compression settings change? */
+	if (btrfs_test_opt(info, COMPRESS) &&
+	    (!old ||
+	     old->compress_type != info->compress_type ||
+	     old->compress_level != info->compress_level ||
+	     (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+	      btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+		const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+		btrfs_info(info, "%s %s compression, level %d",
+			   btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+			   compress_type, info->compress_level);
+	}
 
-		btrfs_clear_sb_rdonly(sb);
+	if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+		btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
 
-		set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct btrfs_fs_context old_ctx;
+	int ret = 0;
+	bool mount_reconfigure = (fc->s_fs_info != NULL);
 
-		/*
-		 * If we've gone from readonly -> read/write, we need to get
-		 * our sync/async discard lists in the right state.
-		 */
-		btrfs_discard_resume(fs_info);
+	btrfs_info_to_ctx(fs_info, &old_ctx);
+
+	sync_filesystem(sb);
+	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+	if (!mount_reconfigure &&
+	    !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+		return -EINVAL;
+
+	ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+	if (ret < 0)
+		return ret;
+
+	btrfs_ctx_to_info(fs_info, ctx);
+	btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+	btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+				 old_ctx.thread_pool_size);
+
+	if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+	    (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+	    (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+		btrfs_warn(fs_info,
+		"remount supports changing free space tree only from RO to RW");
+		/* Make sure free space cache options match the state on disk. */
+		if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+			btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+			btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+		}
+		if (btrfs_free_space_cache_v1_active(fs_info)) {
+			btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+			btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+		}
 	}
-out:
+
+	ret = 0;
+	if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+		ret = btrfs_remount_ro(fs_info);
+	else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+		ret = btrfs_remount_rw(fs_info);
+	if (ret)
+		goto restore;
+
 	/*
-	 * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
-	 * since the absence of the flag means it can be toggled off by remount.
+	 * If we set the mask during the parameter parsing VFS would reject the
+	 * remount.  Here we can set the mask and the value will be updated
+	 * appropriately.
 	 */
-	*flags |= SB_I_VERSION;
+	if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+		fc->sb_flags_mask |= SB_POSIXACL;
 
+	btrfs_emit_options(fs_info, &old_ctx);
 	wake_up_process(fs_info->transaction_kthread);
-	btrfs_remount_cleanup(fs_info, old_opts);
+	btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
 	btrfs_clear_oneshot_options(fs_info);
 	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
 
 	return 0;
-
 restore:
-	/* We've hit an error - don't reset SB_RDONLY */
-	if (sb_rdonly(sb))
-		old_flags |= SB_RDONLY;
-	if (!(old_flags & SB_RDONLY))
-		clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
-	sb->s_flags = old_flags;
-	fs_info->mount_opt = old_opts;
-	fs_info->compress_type = old_compress_type;
-	fs_info->max_inline = old_max_inline;
-	btrfs_resize_thread_pool(fs_info,
-		old_thread_pool_size, fs_info->thread_pool_size);
-	fs_info->metadata_ratio = old_metadata_ratio;
-	btrfs_remount_cleanup(fs_info, old_opts);
+	btrfs_ctx_to_info(fs_info, &old_ctx);
+	btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
 	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
 	return ret;
 }
 
@@ -2096,6 +1776,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct btrfs_fs_info *p = fc->s_fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+	return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+	struct btrfs_fs_info *fs_info = fc->s_fs_info;
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct btrfs_fs_devices *fs_devices = NULL;
+	struct block_device *bdev;
+	struct btrfs_device *device;
+	struct super_block *sb;
+	blk_mode_t mode = btrfs_open_mode(fc);
+	int ret;
+
+	btrfs_ctx_to_info(fs_info, ctx);
+	mutex_lock(&uuid_mutex);
+
+	/*
+	 * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+	 * either a valid device or an error.
+	 */
+	device = btrfs_scan_one_device(fc->source, mode, true);
+	ASSERT(device != NULL);
+	if (IS_ERR(device)) {
+		mutex_unlock(&uuid_mutex);
+		return PTR_ERR(device);
+	}
+
+	fs_devices = device->fs_devices;
+	fs_info->fs_devices = fs_devices;
+
+	ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+	mutex_unlock(&uuid_mutex);
+	if (ret)
+		return ret;
+
+	if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+		ret = -EACCES;
+		goto error;
+	}
+
+	bdev = fs_devices->latest_dev->bdev;
+
+	/*
+	 * From now on the error handling is not straightforward.
+	 *
+	 * If successful, this will transfer the fs_info into the super block,
+	 * and fc->s_fs_info will be NULL.  However if there's an existing
+	 * super, we'll still have fc->s_fs_info populated.  If we error
+	 * completely out it'll be cleaned up when we drop the fs_context,
+	 * otherwise it's tied to the lifetime of the super_block.
+	 */
+	sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		goto error;
+	}
+
+	set_device_specific_options(fs_info);
+
+	if (sb->s_root) {
+		btrfs_close_devices(fs_devices);
+		if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
+			ret = -EBUSY;
+	} else {
+		snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+		btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+		ret = btrfs_fill_super(sb, fs_devices, NULL);
+	}
+
+	if (ret) {
+		deactivate_locked_super(sb);
+		return ret;
+	}
+
+	btrfs_clear_oneshot_options(fs_info);
+
+	fc->root = dget(sb->s_root);
+	return 0;
+
+error:
+	btrfs_close_devices(fs_devices);
+	return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ *        (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ *       (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ *	(iii) # create rw superblock + initial mount
+ *	      mount -t xfs /dev/sdb /opt/
+ *
+ *	      # create ro bind mount
+ *	      mount --bind -o ro /opt/foo /mnt/foo
+ *
+ *	      # unmount initial mount
+ *	      umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro.  The only way to change the ro state of
+ * either object was by passing ms_rdonly. If a new mount was created via
+ * mount(2) such as:
+ *
+ *      mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ *     @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
+ *     made the superblock ro. Note, how SB_RDONLY has the same value as
+ *     ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect on the old mount API on btrfs subvolume mounting
+ * which combines the distinct step in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
+ * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
+ * passed by mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either
+ *     fsmount() or mount_setattr() this is a pure VFS level change for a
+ *     specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ *     fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ *     in fc->sb_flags.
+ *
+ * This disambiguation has rather positive consequences.  Mounting a subvolume
+ * ro will not also turn the superblock ro. Only the mount for the subvolume
+ * will become ro.
+ *
+ * So, if the superblock creation request comes from the new mount API the
+ * caller must have explicitly done:
+ *
+ *      fsconfig(FSCONFIG_SET_FLAG, "ro")
+ *      fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
+ *
+ * IOW, at some point the caller must have explicitly turned the whole
+ * superblock ro and we shouldn't just undo it like we did for the old mount
+ * API. In any case, it lets us avoid the hack in the new mount API.
+ *
+ * Consequently, the remounting hack must only be used for requests originating
+ * from the old mount API and should be marked for full deprecation so it can be
+ * turned off in a couple of years.
+ *
+ * The new mount API has no reason to support this hack.
+ */
+static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+{
+	struct vfsmount *mnt;
+	int ret;
+	const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
+
+	/*
+	 * We got an EBUSY because our SB_RDONLY flag didn't match the existing
+	 * super block, so invert our setting here and retry the mount so we
+	 * can get our vfsmount.
+	 */
+	if (ro2rw)
+		fc->sb_flags |= SB_RDONLY;
+	else
+		fc->sb_flags &= ~SB_RDONLY;
+
+	mnt = fc_mount(fc);
+	if (IS_ERR(mnt))
+		return mnt;
+
+	if (!fc->oldapi || !ro2rw)
+		return mnt;
+
+	/* We need to convert to rw, call reconfigure. */
+	fc->sb_flags &= ~SB_RDONLY;
+	down_write(&mnt->mnt_sb->s_umount);
+	ret = btrfs_reconfigure(fc);
+	up_write(&mnt->mnt_sb->s_umount);
+	if (ret) {
+		mntput(mnt);
+		return ERR_PTR(ret);
+	}
+	return mnt;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+	struct btrfs_fs_info *fs_info = NULL;
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct fs_context *dup_fc;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+
+	/*
+	 * Setup a dummy root and fs_info for test/set super.  This is because
+	 * we don't actually fill this stuff out until open_ctree, but we need
+	 * then open_ctree will properly initialize the file system specific
+	 * settings later.  btrfs_init_fs_info initializes the static elements
+	 * of the fs_info (locks and such) to make cleanup easier if we find a
+	 * superblock with our given fs_devices later on at sget() time.
+	 */
+	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+	if (!fs_info)
+		return -ENOMEM;
+
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		btrfs_free_fs_info(fs_info);
+		return -ENOMEM;
+	}
+	btrfs_init_fs_info(fs_info);
+
+	dup_fc = vfs_dup_fs_context(fc);
+	if (IS_ERR(dup_fc)) {
+		btrfs_free_fs_info(fs_info);
+		return PTR_ERR(dup_fc);
+	}
+
+	/*
+	 * When we do the sget_fc this gets transferred to the sb, so we only
+	 * need to set it on the dup_fc as this is what creates the super block.
+	 */
+	dup_fc->s_fs_info = fs_info;
+
+	/*
+	 * We'll do the security settings in our btrfs_get_tree_super() mount
+	 * loop, they were duplicated into dup_fc, we can drop the originals
+	 * here.
+	 */
+	security_free_mnt_opts(&fc->security);
+	fc->security = NULL;
+
+	mnt = fc_mount(dup_fc);
+	if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
+		mnt = btrfs_reconfigure_for_mount(dup_fc);
+	put_fs_context(dup_fc);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	/*
+	 * This free's ->subvol_name, because if it isn't set we have to
+	 * allocate a buffer to hold the subvol_name, so we just drop our
+	 * reference to it here.
+	 */
+	dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+	ctx->subvol_name = NULL;
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	fc->root = dentry;
+	return 0;
+}
+
+static int btrfs_get_tree(struct fs_context *fc)
+{
+	/*
+	 * Since we use mount_subtree to mount the default/specified subvol, we
+	 * have to do mounts in two steps.
+	 *
+	 * First pass through we call btrfs_get_tree_subvol(), this is just a
+	 * wrapper around fc_mount() to call back into here again, and this time
+	 * we'll call btrfs_get_tree_super().  This will do the open_ctree() and
+	 * everything to open the devices and file system.  Then we return back
+	 * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
+	 * from there we can do our mount_subvol() call, which will lookup
+	 * whichever subvol we're mounting and setup this fc with the
+	 * appropriate dentry for the subvol.
+	 */
+	if (fc->s_fs_info)
+		return btrfs_get_tree_super(fc);
+	return btrfs_get_tree_subvol(fc);
+}
+
 static void btrfs_kill_super(struct super_block *sb)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2103,22 +2086,85 @@ static void btrfs_kill_super(struct super_block *sb)
 	btrfs_free_fs_info(fs_info);
 }
 
-static struct file_system_type btrfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "btrfs",
-	.mount		= btrfs_mount,
-	.kill_sb	= btrfs_kill_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+	struct btrfs_fs_context *ctx = fc->fs_private;
+	struct btrfs_fs_info *fs_info = fc->s_fs_info;
+
+	if (fs_info)
+		btrfs_free_fs_info(fs_info);
+
+	if (ctx && refcount_dec_and_test(&ctx->refs)) {
+		kfree(ctx->subvol_name);
+		kfree(ctx);
+	}
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+	struct btrfs_fs_context *ctx = src_fc->fs_private;
 
-static struct file_system_type btrfs_root_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "btrfs",
-	.mount		= btrfs_mount_root,
-	.kill_sb	= btrfs_kill_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+	/*
+	 * Give a ref to our ctx to this dup, as we want to keep it around for
+	 * our original fc so we can have the subvolume name or objectid.
+	 *
+	 * We unset ->source in the original fc because the dup needs it for
+	 * mounting, and then once we free the dup it'll free ->source, so we
+	 * need to make sure we're only pointing to it in one fc.
+	 */
+	refcount_inc(&ctx->refs);
+	fc->fs_private = ctx;
+	fc->source = src_fc->source;
+	src_fc->source = NULL;
+	return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+	.parse_param	= btrfs_parse_param,
+	.reconfigure	= btrfs_reconfigure,
+	.get_tree	= btrfs_get_tree,
+	.dup		= btrfs_dup_fs_context,
+	.free		= btrfs_free_fs_context,
 };
 
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+	struct btrfs_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	refcount_set(&ctx->refs, 1);
+	fc->fs_private = ctx;
+	fc->ops = &btrfs_fs_context_ops;
+
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+		btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+	} else {
+		ctx->thread_pool_size =
+			min_t(unsigned long, num_online_cpus() + 2, 8);
+		ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+		ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+	}
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+	fc->sb_flags |= SB_POSIXACL;
+#endif
+	fc->sb_flags |= SB_I_VERSION;
+
+	return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "btrfs",
+	.init_fs_context	= btrfs_init_fs_context,
+	.parameters		= btrfs_fs_parameters,
+	.kill_sb		= btrfs_kill_super,
+	.fs_flags		= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ };
+
 MODULE_ALIAS_FS("btrfs");
 
 static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2330,7 +2376,6 @@ static const struct super_operations btrfs_super_ops = {
 	.destroy_inode	= btrfs_destroy_inode,
 	.free_inode	= btrfs_free_inode,
 	.statfs		= btrfs_statfs,
-	.remount_fs	= btrfs_remount,
 	.freeze_fs	= btrfs_freeze,
 	.unfreeze_fs	= btrfs_unfreeze,
 };
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..f18253ca280d 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,12 @@
 #ifndef BTRFS_SUPER_H
 #define BTRFS_SUPER_H
 
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
-			unsigned long new_flags);
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+			 unsigned long flags);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
 					  u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e6b51fb3ddc1..84c05246ffd8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
 	unsigned long long limit;
 
 	limit = memparse(buf, &endptr);
+	/* There could be trailing '\n', also catch any typos after the value. */
+	endptr = skip_spaces(endptr);
+	if (*endptr != 0)
+		return -EINVAL;
 	WRITE_ONCE(device->scrub_speed_max, limit);
 	return len;
 }
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..709c6cc9706a 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
 	[TEST_ALLOC_INODE]	     = "cannot allocate inode",
 	[TEST_ALLOC_BLOCK_GROUP]     = "cannot allocate block group",
 	[TEST_ALLOC_EXTENT_MAP]      = "cannot allocate extent map",
+	[TEST_ALLOC_CHUNK_MAP]       = "cannot allocate chunk map",
 };
 
 static const struct super_operations btrfs_test_super_ops = {
@@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
-	extent_io_tree_init(NULL, &dev->alloc_state, 0);
+	extent_io_tree_init(fs_info, &dev->alloc_state, 0);
 	INIT_LIST_HEAD(&dev->dev_list);
 	list_add(&dev->dev_list, &fs_info->fs_devices->devices);
 
@@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 	}
 	spin_unlock(&fs_info->buffer_lock);
 
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_mapping_tree_free(fs_info);
 	list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
 				 dev_list) {
 		btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
 	TEST_ALLOC_INODE,
 	TEST_ALLOC_BLOCK_GROUP,
 	TEST_ALLOC_EXTENT_MAP,
+	TEST_ALLOC_CHUNK_MAP,
 };
 
 extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1cc86af97dc6..25b3349595e0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
 					const char *test_name)
 {
 	for (int i = 0; i < eb->len; i++) {
-		struct page *page = eb->pages[i >> PAGE_SHIFT];
+		struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
 		void *addr = page_address(page) + offset_in_page(i);
 
 		if (memcmp(addr, memory + i, 1) != 0) {
@@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
 				const char *test_name)
 {
 	for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
-		void *eb_addr = page_address(eb->pages[i]);
+		void *eb_addr = folio_address(eb->folios[i]);
 
 		if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
 			dump_eb_and_memory_contents(eb, memory, test_name);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 29bdd08b241f..253cce7ffecf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
 #ifdef CONFIG_BTRFS_DEBUG
 		if (refcount_read(&em->refs) != 1) {
 			test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
 				 em->start, em->len, em->block_start,
 				 em->block_len, refcount_read(&em->refs));
 
@@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
 	em->block_start = 0;
 	em->block_len = SZ_16K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [0, 16K)");
@@ -94,7 +94,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
 	em->block_start = SZ_32K; /* avoid merging */
 	em->block_len = SZ_4K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [16K, 20K)");
@@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
 		test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
 		goto out;
 	}
-	if (em &&
-	    (em->start != 0 || extent_map_end(em) != SZ_16K ||
-	     em->block_start != 0 || em->block_len != SZ_16K)) {
+	if (!em) {
+		test_err("case1 [%llu %llu]: no extent map returned",
+			 start, start + len);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+	    em->block_start != 0 || em->block_len != SZ_16K) {
 		test_err(
 "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 start, start + len, ret, em->start, em->len,
@@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
 	em->block_start = EXTENT_MAP_INLINE;
 	em->block_len = (u64)-1;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [0, 1K)");
@@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
 	em->block_start = SZ_4K;
 	em->block_len = SZ_4K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [4K, 8K)");
@@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
 		test_err("case2 [0 1K]: ret %d", ret);
 		goto out;
 	}
-	if (em &&
-	    (em->start != 0 || extent_map_end(em) != SZ_1K ||
-	     em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+	if (!em) {
+		test_err("case2 [0 1K]: no extent map returned");
+		ret = -ENOENT;
+		goto out;
+	}
+	if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+	    em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
 		test_err(
 "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
 			 ret, em->start, em->len, em->block_start,
@@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
 	em->block_start = SZ_4K;
 	em->block_len = SZ_4K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [4K, 8K)");
@@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	write_unlock(&em_tree->lock);
 	if (ret) {
-		test_err("case3 [0x%llx 0x%llx): ret %d",
+		test_err("case3 [%llu %llu): ret %d",
 			 start, start + len, ret);
 		goto out;
 	}
+	if (!em) {
+		test_err("case3 [%llu %llu): no extent map returned",
+			 start, start + len);
+		ret = -ENOENT;
+		goto out;
+	}
 	/*
 	 * Since bytes within em are contiguous, em->block_start is identical to
 	 * em->start.
 	 */
-	if (em &&
-	    (start < em->start || start + len > extent_map_end(em) ||
-	     em->start != em->block_start || em->len != em->block_len)) {
+	if (start < em->start || start + len > extent_map_end(em) ||
+	    em->start != em->block_start || em->len != em->block_len) {
 		test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
 			 start, start + len, ret, em->start, em->len,
 			 em->block_start, em->block_len);
 		ret = -EINVAL;
@@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	em->block_start = 0;
 	em->block_len = SZ_8K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [0, 8K)");
@@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	em->block_start = SZ_16K; /* avoid merging */
 	em->block_len = 24 * SZ_1K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("cannot add extent range [8K, 32K)");
@@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 	write_unlock(&em_tree->lock);
 	if (ret) {
-		test_err("case4 [0x%llx 0x%llx): ret %d",
-			 start, len, ret);
+		test_err("case4 [%llu %llu): ret %d",
+			 start, start + len, ret);
 		goto out;
 	}
-	if (em && (start < em->start || start + len > extent_map_end(em))) {
+	if (!em) {
+		test_err("case4 [%llu %llu): no extent map returned",
+			 start, start + len);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (start < em->start || start + len > extent_map_end(em)) {
 		test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
-			 start, len, ret, em->start, em->len, em->block_start,
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
+			 start, start + len, ret, em->start, em->len, em->block_start,
 			 em->block_len);
 		ret = -EINVAL;
 	}
@@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_fs_info *fs_info,
+				 struct extent_map_tree *em_tree,
 				 u64 start, u64 len, u64 block_start)
 {
 	struct extent_map *em;
@@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
 	em->len = len;
 	em->block_start = block_start;
 	em->block_len = SZ_4K;
-	set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+	em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	free_extent_map(em);
 	if (ret < 0) {
@@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
  * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
  * merging the em's.
  */
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info)
 {
 	struct extent_map_tree *em_tree;
 	struct inode *inode;
@@ -585,35 +606,35 @@ static int test_case_5(void)
 	em_tree = &BTRFS_I(inode)->extent_tree;
 
 	/* [0, 12k) */
-	ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+	ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0);
 	if (ret) {
 		test_err("cannot add extent range [0, 12K)");
 		goto out;
 	}
 
 	/* [12k, 24k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+	ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
 	if (ret) {
 		test_err("cannot add extent range [12k, 24k)");
 		goto out;
 	}
 
 	/* [24k, 36k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+	ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
 	if (ret) {
 		test_err("cannot add extent range [12k, 24k)");
 		goto out;
 	}
 
 	/* [36k, 40k) */
-	ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+	ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
 	if (ret) {
 		test_err("cannot add extent range [12k, 24k)");
 		goto out;
 	}
 
 	/* [40k, 64k) */
-	ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+	ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
 	if (ret) {
 		test_err("cannot add extent range [12k, 24k)");
 		goto out;
@@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
 	struct extent_map *em = NULL;
 	int ret;
 
-	ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+	ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0);
 	if (ret)
 		goto out;
 
-	ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+	ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0);
 	if (ret)
 		goto out;
 
@@ -713,7 +734,7 @@ out:
  * true would mess up the start/end calculations and subsequent splits would be
  * incorrect.
  */
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
@@ -742,9 +763,9 @@ static int test_case_7(void)
 	em->len = SZ_16K;
 	em->block_start = 0;
 	em->block_len = SZ_4K;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->flags |= EXTENT_FLAG_PINNED;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("couldn't add extent map");
@@ -765,7 +786,7 @@ static int test_case_7(void)
 	em->block_start = SZ_32K;
 	em->block_len = SZ_16K;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
 	write_unlock(&em_tree->lock);
 	if (ret < 0) {
 		test_err("couldn't add extent map");
@@ -859,33 +880,21 @@ struct rmap_test_vector {
 static int test_rmap_block(struct btrfs_fs_info *fs_info,
 			   struct rmap_test_vector *test)
 {
-	struct extent_map *em;
-	struct map_lookup *map = NULL;
+	struct btrfs_chunk_map *map;
 	u64 *logical = NULL;
 	int i, out_ndaddrs, out_stripe_len;
 	int ret;
 
-	em = alloc_extent_map();
-	if (!em) {
-		test_std_err(TEST_ALLOC_EXTENT_MAP);
-		return -ENOMEM;
-	}
-
-	map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+	map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
 	if (!map) {
-		kfree(em);
-		test_std_err(TEST_ALLOC_EXTENT_MAP);
+		test_std_err(TEST_ALLOC_CHUNK_MAP);
 		return -ENOMEM;
 	}
 
-	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
 	/* Start at 4GiB logical address */
-	em->start = SZ_4G;
-	em->len = test->data_stripe_size * test->num_data_stripes;
-	em->block_len = em->len;
-	em->orig_block_len = test->data_stripe_size;
-	em->map_lookup = map;
-
+	map->start = SZ_4G;
+	map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+	map->stripe_size = test->data_stripe_size;
 	map->num_stripes = test->num_stripes;
 	map->type = test->raid_type;
 
@@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
 		map->stripes[i].physical = test->data_stripe_phys_start[i];
 	}
 
-	write_lock(&fs_info->mapping_tree.lock);
-	ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
-	write_unlock(&fs_info->mapping_tree.lock);
+	ret = btrfs_add_chunk_map(fs_info, map);
 	if (ret) {
-		test_err("error adding block group mapping to mapping tree");
+		test_err("error adding chunk map to mapping tree");
 		goto out_free;
 	}
 
-	ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+	ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
 			       &logical, &out_ndaddrs, &out_stripe_len);
 	if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
 		test_err("didn't rmap anything but expected %d",
@@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
 
 	ret = 0;
 out:
-	write_lock(&fs_info->mapping_tree.lock);
-	remove_extent_mapping(&fs_info->mapping_tree, em);
-	write_unlock(&fs_info->mapping_tree.lock);
-	/* For us */
-	free_extent_map(em);
+	btrfs_remove_chunk_map(fs_info, map);
 out_free:
-	/* For the tree */
-	free_extent_map(em);
 	kfree(logical);
 	return ret;
 }
@@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void)
 	ret = test_case_4(fs_info, em_tree);
 	if (ret)
 		goto out;
-	ret = test_case_5();
+	ret = test_case_5(fs_info);
 	if (ret)
 		goto out;
 	ret = test_case_6(fs_info, em_tree);
 	if (ret)
 		goto out;
-	ret = test_case_7();
+	ret = test_case_7(fs_info);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..9957de9f7806 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
 		      sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
 }
 
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
 
 static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 {
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	/*
@@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
@@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
@@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
@@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 prealloc_only, em->flags);
 		goto out;
 	}
@@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 prealloc_only, em->flags);
 		goto out;
 	}
@@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != orig_start) {
@@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != prealloc_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 prealloc_only, em->flags);
 		goto out;
 	}
@@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 compressed_only, em->flags);
 		goto out;
 	}
@@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 			 em->start, em->orig_start);
 		goto out;
 	}
-	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+	if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
 		test_err("unexpected compress type, wanted %d, got %d",
-			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+			 BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 compressed_only, em->flags);
 		goto out;
 	}
@@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 			 em->start, em->orig_start);
 		goto out;
 	}
-	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+	if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
 		test_err("unexpected compress type, wanted %d, got %d",
-			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+			 BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
 		goto out;
 	}
 	disk_bytenr = em->block_start;
@@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != compressed_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 compressed_only, em->flags);
 		goto out;
 	}
@@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 			 em->start, orig_start);
 		goto out;
 	}
-	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+	if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
 		test_err("unexpected compress type, wanted %d, got %d",
-			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+			 BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
 		goto out;
 	}
 	offset = em->start + em->len;
@@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
@@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != vacancy_only) {
-		test_err("unexpected flags set, want %lu have %lu",
+		test_err("unexpected flags set, want %u have %u",
 			 vacancy_only, em->flags);
 		goto out;
 	}
@@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, want 0 have %lu", em->flags);
+		test_err("unexpected flags set, want 0 have %u", em->flags);
 		goto out;
 	}
 	if (em->orig_start != em->start) {
@@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != vacancy_only) {
-		test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+		test_err("wrong flags, wanted %u, have %u", vacancy_only,
 			 em->flags);
 		goto out;
 	}
@@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 	if (em->flags != 0) {
-		test_err("unexpected flags set, wanted 0 got %lu",
+		test_err("unexpected flags set, wanted 0 got %u",
 			 em->flags);
 		goto out;
 	}
@@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
 
 	test_msg("running inode tests");
 
-	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
-	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+	compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+	prealloc_only |= EXTENT_FLAG_PREALLOC;
 
 	ret = test_btrfs_get_extent(sectorsize, nodesize);
 	if (ret)
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..14b9fbe82da4 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -22,7 +22,7 @@ struct btrfs_tree_parent_check {
 
 	/*
 	 * Expected transid, can be 0 to skip the check, but such skip
-	 * should only be utlized for backref walk related code.
+	 * should only be utilized for backref walk related code.
 	 */
 	u64 transid;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d6729d9fd2f..331fc7429952 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
 		ret = btrfs_pin_reserved_extent(trans, eb);
 		if (ret)
 			return ret;
-		btrfs_redirty_list_add(trans->transaction, eb);
 	} else {
 		unaccount_log_buffer(eb->fs_info, eb->start);
 	}
@@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 	int ret = 0;
 
 	if (inode->flags & BTRFS_INODE_NODATASUM ||
-	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    (em->flags & EXTENT_FLAG_PREALLOC) ||
 	    em->block_start == EXTENT_MAP_HOLE)
 		return 0;
 
@@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 		return 0;
 
 	/* If we're compressed we have to save the entire range of csums. */
-	if (em->compress_type) {
+	if (extent_map_is_compressed(em)) {
 		csum_offset = 0;
 		csum_len = max(em->block_len, em->orig_block_len);
 	} else {
@@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item fi = { 0 };
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	enum btrfs_compression_type compress_type;
 	u64 extent_offset = em->start - em->orig_start;
 	u64 block_len;
 	int ret;
 
 	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+	if (em->flags & EXTENT_FLAG_PREALLOC)
 		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
 	else
 		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
 
 	block_len = max(em->block_len, em->orig_block_len);
-	if (em->compress_type != BTRFS_COMPRESS_NONE) {
+	compress_type = extent_map_compression(em);
+	if (compress_type != BTRFS_COMPRESS_NONE) {
 		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
 		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
 	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
 	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
-	btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+	btrfs_set_stack_file_extent_compression(&fi, compress_type);
 
 	ret = log_extent_csums(trans, inode, log, em, ctx);
 	if (ret)
@@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 			continue;
 
 		/* We log prealloc extents beyond eof later. */
-		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+		if ((em->flags & EXTENT_FLAG_PREALLOC) &&
 		    em->start >= i_size_read(&inode->vfs_inode))
 			continue;
 
 		/* Need a ref to keep it from getting evicted from cache */
 		refcount_inc(&em->refs);
-		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+		em->flags |= EXTENT_FLAG_LOGGING;
 		list_add_tail(&em->list, &extents);
 		num++;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f627674b37db..4c32497311d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -41,6 +41,17 @@
 					 BTRFS_BLOCK_GROUP_RAID10 | \
 					 BTRFS_BLOCK_GROUP_RAID56_MASK)
 
+struct btrfs_io_geometry {
+	u32 stripe_index;
+	u32 stripe_nr;
+	int mirror_num;
+	int num_stripes;
+	u64 stripe_offset;
+	u64 raid56_full_stripe_start;
+	int max_errors;
+	enum btrfs_map_op op;
+};
+
 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
@@ -1742,19 +1753,18 @@ out:
 
 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
 	struct rb_node *n;
 	u64 ret = 0;
 
-	em_tree = &fs_info->mapping_tree;
-	read_lock(&em_tree->lock);
-	n = rb_last(&em_tree->map.rb_root);
+	read_lock(&fs_info->mapping_tree_lock);
+	n = rb_last(&fs_info->mapping_tree.rb_root);
 	if (n) {
-		em = rb_entry(n, struct extent_map, rb_node);
-		ret = em->start + em->len;
+		struct btrfs_chunk_map *map;
+
+		map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+		ret = map->start + map->chunk_len;
 	}
-	read_unlock(&em_tree->lock);
+	read_unlock(&fs_info->mapping_tree_lock);
 
 	return ret;
 }
@@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 	return ret;
 }
 
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+						    u64 logical, u64 length)
+{
+	struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *orig_prev;
+	struct btrfs_chunk_map *map;
+	struct btrfs_chunk_map *prev_map = NULL;
+
+	while (node) {
+		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+		prev = node;
+		prev_map = map;
+
+		if (logical < map->start) {
+			node = node->rb_left;
+		} else if (logical >= map->start + map->chunk_len) {
+			node = node->rb_right;
+		} else {
+			refcount_inc(&map->refs);
+			return map;
+		}
+	}
+
+	if (!prev)
+		return NULL;
+
+	orig_prev = prev;
+	while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+		prev = rb_next(prev);
+		prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+	}
+
+	if (!prev) {
+		prev = orig_prev;
+		prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+		while (prev && logical < prev_map->start) {
+			prev = rb_prev(prev);
+			prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+		}
+	}
+
+	if (prev) {
+		u64 end = logical + length;
+
+		/*
+		 * Caller can pass a U64_MAX length when it wants to get any
+		 * chunk starting at an offset of 'logical' or higher, so deal
+		 * with underflow by resetting the end offset to U64_MAX.
+		 */
+		if (end < logical)
+			end = U64_MAX;
+
+		if (end > prev_map->start &&
+		    logical < prev_map->start + prev_map->chunk_len) {
+			refcount_inc(&prev_map->refs);
+			return prev_map;
+		}
+	}
+
+	return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+					     u64 logical, u64 length)
+{
+	struct btrfs_chunk_map *map;
+
+	read_lock(&fs_info->mapping_tree_lock);
+	map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+	read_unlock(&fs_info->mapping_tree_lock);
+
+	return map;
+}
+
 /*
  * Find the mapping containing the given logical extent.
  *
@@ -2994,38 +3079,37 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
  *
  * Return: Chunk mapping or ERR_PTR.
  */
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
-				       u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+					    u64 logical, u64 length)
 {
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
+	struct btrfs_chunk_map *map;
 
-	em_tree = &fs_info->mapping_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, logical, length);
-	read_unlock(&em_tree->lock);
+	map = btrfs_find_chunk_map(fs_info, logical, length);
 
-	if (!em) {
+	if (unlikely(!map)) {
+		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "unable to find chunk map for logical %llu length %llu",
 			   logical, length);
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (em->start > logical || em->start + em->len <= logical) {
+	if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
+		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
-			   logical, logical + length, em->start, em->start + em->len);
-		free_extent_map(em);
+			   logical, logical + length, map->start,
+			   map->start + map->chunk_len);
+		btrfs_free_chunk_map(map);
 		return ERR_PTR(-EINVAL);
 	}
 
-	/* callers are responsible for dropping em's ref. */
-	return em;
+	/* Callers are responsible for dropping the reference. */
+	return map;
 }
 
 static int remove_chunk_item(struct btrfs_trans_handle *trans,
-			     struct map_lookup *map, u64 chunk_offset)
+			     struct btrfs_chunk_map *map, u64 chunk_offset)
 {
 	int i;
 
@@ -3050,23 +3134,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 dev_extent_len = 0;
 	int i, ret = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
-	if (IS_ERR(em)) {
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+	if (IS_ERR(map)) {
 		/*
 		 * This is a logic error, but we don't want to just rely on the
 		 * user having built with ASSERT enabled, so if ASSERT doesn't
 		 * do anything we still error out.
 		 */
 		ASSERT(0);
-		return PTR_ERR(em);
+		return PTR_ERR(map);
 	}
-	map = em->map_lookup;
 
 	/*
 	 * First delete the device extent items from the devices btree.
@@ -3169,7 +3251,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 		goto out;
 	}
 
-	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+	trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
 
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3188,7 +3270,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 	 */
 	btrfs_trans_release_chunk_metadata(trans);
 
-	ret = btrfs_remove_block_group(trans, chunk_offset, em);
+	ret = btrfs_remove_block_group(trans, map);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -3200,7 +3282,7 @@ out:
 		trans->removing_chunk = false;
 	}
 	/* once for us */
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -5347,24 +5429,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
 	}
 }
 
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+	for (int i = 0; i < map->num_stripes; i++) {
+		struct btrfs_io_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		set_extent_bit(&device->alloc_state, stripe->physical,
+			       stripe->physical + map->stripe_size - 1,
+			       bits | EXTENT_NOWAIT, NULL);
+	}
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+	for (int i = 0; i < map->num_stripes; i++) {
+		struct btrfs_io_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		__clear_extent_bit(&device->alloc_state, stripe->physical,
+				   stripe->physical + map->stripe_size - 1,
+				   bits | EXTENT_NOWAIT,
+				   NULL, NULL);
+	}
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+	write_lock(&fs_info->mapping_tree_lock);
+	rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+	RB_CLEAR_NODE(&map->rb_node);
+	chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+	write_unlock(&fs_info->mapping_tree_lock);
+
+	/* Once for the tree reference. */
+	btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
+
+	write_lock(&fs_info->mapping_tree_lock);
+	p = &fs_info->mapping_tree.rb_root.rb_node;
+	while (*p) {
+		struct btrfs_chunk_map *entry;
+
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+		if (map->start < entry->start) {
+			p = &(*p)->rb_left;
+		} else if (map->start > entry->start) {
+			p = &(*p)->rb_right;
+			leftmost = false;
+		} else {
+			write_unlock(&fs_info->mapping_tree_lock);
+			return -EEXIST;
+		}
+	}
+	rb_link_node(&map->rb_node, parent, p);
+	rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+	chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+	chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+	write_unlock(&fs_info->mapping_tree_lock);
+
+	return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+	struct btrfs_chunk_map *map;
+
+	map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+	if (!map)
+		return NULL;
+
+	refcount_set(&map->refs, 1);
+	RB_CLEAR_NODE(&map->rb_node);
+
+	return map;
+}
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp)
+{
+	const int size = btrfs_chunk_map_size(map->num_stripes);
+	struct btrfs_chunk_map *clone;
+
+	clone = kmemdup(map, size, gfp);
+	if (!clone)
+		return NULL;
+
+	refcount_set(&clone->refs, 1);
+	RB_CLEAR_NODE(&clone->rb_node);
+
+	return clone;
+}
+
 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 			struct alloc_chunk_ctl *ctl,
 			struct btrfs_device_info *devices_info)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
-	struct map_lookup *map = NULL;
-	struct extent_map_tree *em_tree;
+	struct btrfs_chunk_map *map;
 	struct btrfs_block_group *block_group;
-	struct extent_map *em;
 	u64 start = ctl->start;
 	u64 type = ctl->type;
 	int ret;
 	int i;
 	int j;
 
-	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+	map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
 	if (!map)
 		return ERR_PTR(-ENOMEM);
+
+	map->start = start;
+	map->chunk_len = ctl->chunk_size;
+	map->stripe_size = ctl->stripe_size;
+	map->type = type;
+	map->io_align = BTRFS_STRIPE_LEN;
+	map->io_width = BTRFS_STRIPE_LEN;
+	map->sub_stripes = ctl->sub_stripes;
 	map->num_stripes = ctl->num_stripes;
 
 	for (i = 0; i < ctl->ndevs; ++i) {
@@ -5375,41 +5564,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 						   j * ctl->stripe_size;
 		}
 	}
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
-	map->type = type;
-	map->sub_stripes = ctl->sub_stripes;
 
 	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
 
-	em = alloc_extent_map();
-	if (!em) {
-		kfree(map);
-		return ERR_PTR(-ENOMEM);
-	}
-	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-	em->map_lookup = map;
-	em->start = start;
-	em->len = ctl->chunk_size;
-	em->block_start = 0;
-	em->block_len = em->len;
-	em->orig_block_len = ctl->stripe_size;
-
-	em_tree = &info->mapping_tree;
-	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
+	ret = btrfs_add_chunk_map(info, map);
 	if (ret) {
-		write_unlock(&em_tree->lock);
-		free_extent_map(em);
+		btrfs_free_chunk_map(map);
 		return ERR_PTR(ret);
 	}
-	write_unlock(&em_tree->lock);
 
 	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
-	if (IS_ERR(block_group))
-		goto error_del_extent;
+	if (IS_ERR(block_group)) {
+		btrfs_remove_chunk_map(info, map);
+		return block_group;
+	}
 
-	for (i = 0; i < map->num_stripes; i++) {
+	for (int i = 0; i < map->num_stripes; i++) {
 		struct btrfs_device *dev = map->stripes[i].dev;
 
 		btrfs_device_set_bytes_used(dev,
@@ -5422,23 +5592,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 	atomic64_sub(ctl->stripe_size * map->num_stripes,
 		     &info->free_chunk_space);
 
-	free_extent_map(em);
 	check_raid56_incompat_flag(info, type);
 	check_raid1c34_incompat_flag(info, type);
 
 	return block_group;
-
-error_del_extent:
-	write_lock(&em_tree->lock);
-	remove_extent_mapping(em_tree, em);
-	write_unlock(&em_tree->lock);
-
-	/* One for our allocation */
-	free_extent_map(em);
-	/* One for the tree reference */
-	free_extent_map(em);
-
-	return block_group;
 }
 
 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5514,8 +5671,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_chunk *chunk;
 	struct btrfs_stripe *stripe;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	size_t item_size;
 	int i;
 	int ret;
@@ -5544,14 +5700,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 	 */
 	lockdep_assert_held(&fs_info->chunk_mutex);
 
-	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
-	if (IS_ERR(em)) {
-		ret = PTR_ERR(em);
+	map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+	if (IS_ERR(map)) {
+		ret = PTR_ERR(map);
 		btrfs_abort_transaction(trans, ret);
 		return ret;
 	}
 
-	map = em->map_lookup;
 	item_size = btrfs_chunk_item_size(map->num_stripes);
 
 	chunk = kzalloc(item_size, GFP_NOFS);
@@ -5608,7 +5763,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 
 out:
 	kfree(chunk);
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -5653,7 +5808,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
 	return 0;
 }
 
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
 {
 	const int index = btrfs_bg_flags_to_raid_index(map->type);
 
@@ -5662,17 +5817,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
 
 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	int miss_ndevs = 0;
 	int i;
 	bool ret = true;
 
-	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
-	if (IS_ERR(em))
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+	if (IS_ERR(map))
 		return false;
 
-	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; i++) {
 		if (test_bit(BTRFS_DEV_STATE_MISSING,
 					&map->stripes[i].dev->dev_state)) {
@@ -5693,38 +5846,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 	if (miss_ndevs > btrfs_chunk_max_errors(map))
 		ret = false;
 end:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map *em;
+	write_lock(&fs_info->mapping_tree_lock);
+	while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+		struct btrfs_chunk_map *map;
+		struct rb_node *node;
 
-	while (1) {
-		write_lock(&tree->lock);
-		em = lookup_extent_mapping(tree, 0, (u64)-1);
-		if (em)
-			remove_extent_mapping(tree, em);
-		write_unlock(&tree->lock);
-		if (!em)
-			break;
-		/* once for us */
-		free_extent_map(em);
-		/* once for the tree */
-		free_extent_map(em);
+		node = rb_first_cached(&fs_info->mapping_tree);
+		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+		rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+		RB_CLEAR_NODE(&map->rb_node);
+		chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+		/* Once for the tree ref. */
+		btrfs_free_chunk_map(map);
+		cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
 	}
+	write_unlock(&fs_info->mapping_tree_lock);
 }
 
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	enum btrfs_raid_types index;
 	int ret = 1;
 
-	em = btrfs_get_chunk_map(fs_info, logical, len);
-	if (IS_ERR(em))
+	map = btrfs_get_chunk_map(fs_info, logical, len);
+	if (IS_ERR(map))
 		/*
 		 * We could return errors for these cases, but that could get
 		 * ugly and we'd probably do the same thing which is just not do
@@ -5733,7 +5885,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		 */
 		return 1;
 
-	map = em->map_lookup;
 	index = btrfs_bg_flags_to_raid_index(map->type);
 
 	/* Non-RAID56, use their ncopies from btrfs_raid_array. */
@@ -5750,53 +5901,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		 * stripe under reconstruction.
 		 */
 		ret = map->num_stripes;
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 				    u64 logical)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	unsigned long len = fs_info->sectorsize;
 
 	if (!btrfs_fs_incompat(fs_info, RAID56))
 		return len;
 
-	em = btrfs_get_chunk_map(fs_info, logical, len);
+	map = btrfs_get_chunk_map(fs_info, logical, len);
 
-	if (!WARN_ON(IS_ERR(em))) {
-		map = em->map_lookup;
+	if (!WARN_ON(IS_ERR(map))) {
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 			len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
-		free_extent_map(em);
+		btrfs_free_chunk_map(map);
 	}
 	return len;
 }
 
 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	int ret = 0;
 
 	if (!btrfs_fs_incompat(fs_info, RAID56))
 		return 0;
 
-	em = btrfs_get_chunk_map(fs_info, logical, len);
+	map = btrfs_get_chunk_map(fs_info, logical, len);
 
-	if(!WARN_ON(IS_ERR(em))) {
-		map = em->map_lookup;
+	if (!WARN_ON(IS_ERR(map))) {
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 			ret = 1;
-		free_extent_map(em);
+		btrfs_free_chunk_map(map);
 	}
 	return ret;
 }
 
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
-			    struct map_lookup *map, int first,
+			    struct btrfs_chunk_map *map, int first,
 			    int dev_replace_is_ongoing)
 {
 	int i;
@@ -5903,8 +6050,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 					       u64 logical, u64 *length_ret,
 					       u32 *num_stripes)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	struct btrfs_discard_stripe *stripes;
 	u64 length = *length_ret;
 	u64 offset;
@@ -5922,11 +6068,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	int ret;
 	int i;
 
-	em = btrfs_get_chunk_map(fs_info, logical, length);
-	if (IS_ERR(em))
-		return ERR_CAST(em);
-
-	map = em->map_lookup;
+	map = btrfs_get_chunk_map(fs_info, logical, length);
+	if (IS_ERR(map))
+		return ERR_CAST(map);
 
 	/* we don't discard raid56 yet */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -5934,8 +6078,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 		goto out_free_map;
 	}
 
-	offset = logical - em->start;
-	length = min_t(u64, em->start + em->len - logical, length);
+	offset = logical - map->start;
+	length = min_t(u64, map->start + map->chunk_len - logical, length);
 	*length_ret = length;
 
 	/*
@@ -6032,10 +6176,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 		}
 	}
 
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return stripes;
 out_free_map:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ERR_PTR(ret);
 }
 
@@ -6133,17 +6277,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
 	bioc->replace_nr_stripes = nr_extra_stripes;
 }
 
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
-			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
-			    u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+			    struct btrfs_io_geometry *io_geom)
 {
 	/*
 	 * Stripe_nr is the stripe where this block falls.  stripe_offset is
 	 * the offset of this block in its stripe.
 	 */
-	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
-	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
-	ASSERT(*stripe_offset < U32_MAX);
+	io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+	io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+	ASSERT(io_geom->stripe_offset < U32_MAX);
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 		unsigned long full_stripe_len =
@@ -6158,18 +6301,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
 		 * to go rounddown(), not round_down(), as nr_data_stripes is
 		 * not ensured to be power of 2.
 		 */
-		*full_stripe_start =
-			btrfs_stripe_nr_to_offset(
-				rounddown(*stripe_nr, nr_data_stripes(map)));
+		io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+			rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
 
-		ASSERT(*full_stripe_start + full_stripe_len > offset);
-		ASSERT(*full_stripe_start <= offset);
+		ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+		ASSERT(io_geom->raid56_full_stripe_start <= offset);
 		/*
 		 * For writes to RAID56, allow to write a full stripe set, but
 		 * no straddling of stripe sets.
 		 */
-		if (op == BTRFS_MAP_WRITE)
-			return full_stripe_len - (offset - *full_stripe_start);
+		if (io_geom->op == BTRFS_MAP_WRITE)
+			return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
 	}
 
 	/*
@@ -6177,26 +6319,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
 	 * a single disk).
 	 */
 	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
-		return BTRFS_STRIPE_LEN - *stripe_offset;
+		return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
 	return U64_MAX;
 }
 
-static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
-			 u64 logical, u64 *length, struct btrfs_io_stripe *dst,
-			 struct map_lookup *map, u32 stripe_index,
-			 u64 stripe_offset, u64 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+			 u64 *length, struct btrfs_io_stripe *dst,
+			 struct btrfs_chunk_map *map,
+			 struct btrfs_io_geometry *io_geom)
 {
-	dst->dev = map->stripes[stripe_index].dev;
+	dst->dev = map->stripes[io_geom->stripe_index].dev;
 
-	if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+	if (io_geom->op == BTRFS_MAP_READ &&
+	    btrfs_need_stripe_tree_update(fs_info, map->type))
 		return btrfs_get_raid_extent_offset(fs_info, logical, length,
-						    map->type, stripe_index, dst);
+						    map->type,
+						    io_geom->stripe_index, dst);
 
-	dst->physical = map->stripes[stripe_index].physical +
-			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+	dst->physical = map->stripes[io_geom->stripe_index].physical +
+			io_geom->stripe_offset +
+			btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
 	return 0;
 }
 
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+				const struct btrfs_io_stripe *smap,
+				const struct btrfs_chunk_map *map,
+				int num_alloc_stripes,
+				enum btrfs_map_op op, int mirror_num)
+{
+	if (!smap)
+		return false;
+
+	if (num_alloc_stripes != 1)
+		return false;
+
+	if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+		return false;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+		return false;
+
+	return true;
+}
+
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+			     struct btrfs_io_geometry *io_geom)
+{
+	io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+	io_geom->stripe_nr /= map->num_stripes;
+	if (io_geom->op == BTRFS_MAP_READ)
+		io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+			     struct btrfs_chunk_map *map,
+			     struct btrfs_io_geometry *io_geom,
+			     bool dev_replace_is_ongoing)
+{
+	if (io_geom->op != BTRFS_MAP_READ) {
+		io_geom->num_stripes = map->num_stripes;
+		return;
+	}
+
+	if (io_geom->mirror_num) {
+		io_geom->stripe_index = io_geom->mirror_num - 1;
+		return;
+	}
+
+	io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+						 dev_replace_is_ongoing);
+	io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+			   struct btrfs_io_geometry *io_geom)
+{
+	if (io_geom->op != BTRFS_MAP_READ) {
+		io_geom->num_stripes = map->num_stripes;
+		return;
+	}
+
+	if (io_geom->mirror_num) {
+		io_geom->stripe_index = io_geom->mirror_num - 1;
+		return;
+	}
+
+	io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+			      struct btrfs_chunk_map *map,
+			      struct btrfs_io_geometry *io_geom,
+			      bool dev_replace_is_ongoing)
+{
+	u32 factor = map->num_stripes / map->sub_stripes;
+	int old_stripe_index;
+
+	io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+	io_geom->stripe_nr /= factor;
+
+	if (io_geom->op != BTRFS_MAP_READ) {
+		io_geom->num_stripes = map->sub_stripes;
+		return;
+	}
+
+	if (io_geom->mirror_num) {
+		io_geom->stripe_index += io_geom->mirror_num - 1;
+		return;
+	}
+
+	old_stripe_index = io_geom->stripe_index;
+	io_geom->stripe_index = find_live_mirror(fs_info, map,
+						 io_geom->stripe_index,
+						 dev_replace_is_ongoing);
+	io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+				    struct btrfs_io_geometry *io_geom,
+				    u64 logical, u64 *length)
+{
+	int data_stripes = nr_data_stripes(map);
+
+	/*
+	 * Needs full stripe mapping.
+	 *
+	 * Push stripe_nr back to the start of the full stripe For those cases
+	 * needing a full stripe, @stripe_nr is the full stripe number.
+	 *
+	 * Originally we go raid56_full_stripe_start / full_stripe_len, but
+	 * that can be expensive.  Here we just divide @stripe_nr with
+	 * @data_stripes.
+	 */
+	io_geom->stripe_nr /= data_stripes;
+
+	/* RAID[56] write or recovery. Return all stripes */
+	io_geom->num_stripes = map->num_stripes;
+	io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+	/* Return the length to the full stripe end. */
+	*length = min(logical + *length,
+		      io_geom->raid56_full_stripe_start + map->start +
+		      btrfs_stripe_nr_to_offset(data_stripes)) -
+		logical;
+	io_geom->stripe_index = 0;
+	io_geom->stripe_offset = 0;
+}
+
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+				   struct btrfs_io_geometry *io_geom)
+{
+	int data_stripes = nr_data_stripes(map);
+
+	ASSERT(io_geom->mirror_num <= 1);
+	/* Just grab the data stripe directly. */
+	io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+	io_geom->stripe_nr /= data_stripes;
+
+	/* We distribute the parity blocks across stripes. */
+	io_geom->stripe_index =
+		(io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
+
+	if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+		io_geom->mirror_num = 1;
+}
+
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+			      struct btrfs_io_geometry *io_geom)
+{
+	io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+	io_geom->stripe_nr /= map->num_stripes;
+	io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
 /*
  * Map one logical range to one or more physical ranges.
  *
@@ -6237,43 +6533,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		    struct btrfs_io_context **bioc_ret,
 		    struct btrfs_io_stripe *smap, int *mirror_num_ret)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
+	struct btrfs_io_geometry io_geom = { 0 };
 	u64 map_offset;
-	u64 stripe_offset;
-	u32 stripe_nr;
-	u32 stripe_index;
-	int data_stripes;
 	int i;
 	int ret = 0;
-	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
-	int num_stripes;
 	int num_copies;
-	int max_errors = 0;
 	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	int dev_replace_is_ongoing = 0;
 	u16 num_alloc_stripes;
-	u64 raid56_full_stripe_start = (u64)-1;
 	u64 max_len;
 
 	ASSERT(bioc_ret);
 
+	io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+	io_geom.num_stripes = 1;
+	io_geom.stripe_index = 0;
+	io_geom.op = op;
+
 	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
-	if (mirror_num > num_copies)
+	if (io_geom.mirror_num > num_copies)
 		return -EINVAL;
 
-	em = btrfs_get_chunk_map(fs_info, logical, *length);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	map = btrfs_get_chunk_map(fs_info, logical, *length);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
-	map = em->map_lookup;
-	data_stripes = nr_data_stripes(map);
-
-	map_offset = logical - em->start;
-	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
-				   &stripe_offset, &raid56_full_stripe_start);
-	*length = min_t(u64, em->len - map_offset, max_len);
+	map_offset = logical - map->start;
+	io_geom.raid56_full_stripe_start = (u64)-1;
+	max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+	*length = min_t(u64, map->chunk_len - map_offset, max_len);
 
 	down_read(&dev_replace->rwsem);
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6284,107 +6574,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	if (!dev_replace_is_ongoing)
 		up_read(&dev_replace->rwsem);
 
-	num_stripes = 1;
-	stripe_index = 0;
-	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		stripe_index = stripe_nr % map->num_stripes;
-		stripe_nr /= map->num_stripes;
-		if (op == BTRFS_MAP_READ)
-			mirror_num = 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
-		if (op != BTRFS_MAP_READ) {
-			num_stripes = map->num_stripes;
-		} else if (mirror_num) {
-			stripe_index = mirror_num - 1;
-		} else {
-			stripe_index = find_live_mirror(fs_info, map, 0,
-					    dev_replace_is_ongoing);
-			mirror_num = stripe_index + 1;
-		}
-
-	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (op != BTRFS_MAP_READ) {
-			num_stripes = map->num_stripes;
-		} else if (mirror_num) {
-			stripe_index = mirror_num - 1;
-		} else {
-			mirror_num = 1;
-		}
-
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-		u32 factor = map->num_stripes / map->sub_stripes;
-
-		stripe_index = (stripe_nr % factor) * map->sub_stripes;
-		stripe_nr /= factor;
-
-		if (op != BTRFS_MAP_READ)
-			num_stripes = map->sub_stripes;
-		else if (mirror_num)
-			stripe_index += mirror_num - 1;
-		else {
-			int old_stripe_index = stripe_index;
-			stripe_index = find_live_mirror(fs_info, map,
-					      stripe_index,
-					      dev_replace_is_ongoing);
-			mirror_num = stripe_index - old_stripe_index + 1;
-		}
-
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		if (op != BTRFS_MAP_READ || mirror_num > 1) {
-			/*
-			 * Needs full stripe mapping.
-			 *
-			 * Push stripe_nr back to the start of the full stripe
-			 * For those cases needing a full stripe, @stripe_nr
-			 * is the full stripe number.
-			 *
-			 * Originally we go raid56_full_stripe_start / full_stripe_len,
-			 * but that can be expensive.  Here we just divide
-			 * @stripe_nr with @data_stripes.
-			 */
-			stripe_nr /= data_stripes;
-
-			/* RAID[56] write or recovery. Return all stripes */
-			num_stripes = map->num_stripes;
-			max_errors = btrfs_chunk_max_errors(map);
-
-			/* Return the length to the full stripe end */
-			*length = min(logical + *length,
-				      raid56_full_stripe_start + em->start +
-				      btrfs_stripe_nr_to_offset(data_stripes)) -
-				  logical;
-			stripe_index = 0;
-			stripe_offset = 0;
-		} else {
-			ASSERT(mirror_num <= 1);
-			/* Just grab the data stripe directly. */
-			stripe_index = stripe_nr % data_stripes;
-			stripe_nr /= data_stripes;
-
-			/* We distribute the parity blocks across stripes */
-			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
-			if (op == BTRFS_MAP_READ && mirror_num < 1)
-				mirror_num = 1;
-		}
-	} else {
+	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case BTRFS_BLOCK_GROUP_RAID0:
+		map_blocks_raid0(map, &io_geom);
+		break;
+	case BTRFS_BLOCK_GROUP_RAID1:
+	case BTRFS_BLOCK_GROUP_RAID1C3:
+	case BTRFS_BLOCK_GROUP_RAID1C4:
+		map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+		break;
+	case BTRFS_BLOCK_GROUP_DUP:
+		map_blocks_dup(map, &io_geom);
+		break;
+	case BTRFS_BLOCK_GROUP_RAID10:
+		map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+		break;
+	case BTRFS_BLOCK_GROUP_RAID5:
+	case BTRFS_BLOCK_GROUP_RAID6:
+		if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+			map_blocks_raid56_write(map, &io_geom, logical, length);
+		else
+			map_blocks_raid56_read(map, &io_geom);
+		break;
+	default:
 		/*
 		 * After this, stripe_nr is the number of stripes on this
 		 * device we have to walk to find the data, and stripe_index is
 		 * the number of our device in the stripe array
 		 */
-		stripe_index = stripe_nr % map->num_stripes;
-		stripe_nr /= map->num_stripes;
-		mirror_num = stripe_index + 1;
+		map_blocks_single(map, &io_geom);
+		break;
 	}
-	if (stripe_index >= map->num_stripes) {
+	if (io_geom.stripe_index >= map->num_stripes) {
 		btrfs_crit(fs_info,
 			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
-			   stripe_index, map->num_stripes);
+			   io_geom.stripe_index, map->num_stripes);
 		ret = -EINVAL;
 		goto out;
 	}
 
-	num_alloc_stripes = num_stripes;
+	num_alloc_stripes = io_geom.num_stripes;
 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
 	    op != BTRFS_MAP_READ)
 		/*
@@ -6401,14 +6630,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	 * physical block information on the stack instead of allocating an
 	 * I/O context structure.
 	 */
-	if (smap && num_alloc_stripes == 1 &&
-	    !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
-	      op != BTRFS_MAP_READ) &&
-	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
-		ret = set_io_stripe(fs_info, op, logical, length, smap, map,
-				    stripe_index, stripe_offset, stripe_nr);
+	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+				io_geom.mirror_num)) {
+		ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
 		if (mirror_num_ret)
-			*mirror_num_ret = mirror_num;
+			*mirror_num_ret = io_geom.mirror_num;
 		*bioc_ret = NULL;
 		goto out;
 	}
@@ -6428,7 +6654,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	 * It's still mostly the same as other profiles, just with extra rotation.
 	 */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
-	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
+	    (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
 		/*
 		 * For RAID56 @stripe_nr is already the number of full stripes
 		 * before us, which is also the rotation value (needs to modulo
@@ -6437,28 +6663,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		 * In this case, we just add @stripe_nr with @i, then do the
 		 * modulo, to reduce one modulo call.
 		 */
-		bioc->full_stripe_logical = em->start +
-			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
-		for (int i = 0; i < num_stripes; i++) {
-			ret = set_io_stripe(fs_info, op, logical, length,
-					    &bioc->stripes[i], map,
-					    (i + stripe_nr) % num_stripes,
-					    stripe_offset, stripe_nr);
-			if (ret < 0)
-				break;
+		bioc->full_stripe_logical = map->start +
+			btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+						  nr_data_stripes(map));
+		for (int i = 0; i < io_geom.num_stripes; i++) {
+			struct btrfs_io_stripe *dst = &bioc->stripes[i];
+			u32 stripe_index;
+
+			stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+			dst->dev = map->stripes[stripe_index].dev;
+			dst->physical =
+				map->stripes[stripe_index].physical +
+				io_geom.stripe_offset +
+				btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
 		}
 	} else {
 		/*
 		 * For all other non-RAID56 profiles, just copy the target
 		 * stripe into the bioc.
 		 */
-		for (i = 0; i < num_stripes; i++) {
-			ret = set_io_stripe(fs_info, op, logical, length,
-					    &bioc->stripes[i], map, stripe_index,
-					    stripe_offset, stripe_nr);
+		for (i = 0; i < io_geom.num_stripes; i++) {
+			ret = set_io_stripe(fs_info, logical, length,
+					    &bioc->stripes[i], map, &io_geom);
 			if (ret < 0)
 				break;
-			stripe_index++;
+			io_geom.stripe_index++;
 		}
 	}
 
@@ -6469,18 +6698,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	}
 
 	if (op != BTRFS_MAP_READ)
-		max_errors = btrfs_chunk_max_errors(map);
+		io_geom.max_errors = btrfs_chunk_max_errors(map);
 
 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
 	    op != BTRFS_MAP_READ) {
 		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
-					  &num_stripes, &max_errors);
+					  &io_geom.num_stripes, &io_geom.max_errors);
 	}
 
 	*bioc_ret = bioc;
-	bioc->num_stripes = num_stripes;
-	bioc->max_errors = max_errors;
-	bioc->mirror_num = mirror_num;
+	bioc->num_stripes = io_geom.num_stripes;
+	bioc->max_errors = io_geom.max_errors;
+	bioc->mirror_num = io_geom.mirror_num;
 
 out:
 	if (dev_replace_is_ongoing) {
@@ -6488,7 +6717,7 @@ out:
 		/* Unlock and let waiting writers proceed */
 		up_read(&dev_replace->rwsem);
 	}
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
@@ -6660,12 +6889,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
 			      devid, uuid);
 }
 
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
 {
-	const struct map_lookup *map = em->map_lookup;
 	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
 
-	return div_u64(em->len, data_stripes);
+	return div_u64(map->chunk_len, data_stripes);
 }
 
 #if BITS_PER_LONG == 32
@@ -6734,9 +6962,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 {
 	BTRFS_DEV_LOOKUP_ARGS(args);
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
-	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
-	struct map_lookup *map;
-	struct extent_map *em;
+	struct btrfs_chunk_map *map;
 	u64 logical;
 	u64 length;
 	u64 devid;
@@ -6770,35 +6996,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 			return ret;
 	}
 
-	read_lock(&map_tree->lock);
-	em = lookup_extent_mapping(map_tree, logical, 1);
-	read_unlock(&map_tree->lock);
+	map = btrfs_find_chunk_map(fs_info, logical, 1);
 
 	/* already mapped? */
-	if (em && em->start <= logical && em->start + em->len > logical) {
-		free_extent_map(em);
+	if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+		btrfs_free_chunk_map(map);
 		return 0;
-	} else if (em) {
-		free_extent_map(em);
+	} else if (map) {
+		btrfs_free_chunk_map(map);
 	}
 
-	em = alloc_extent_map();
-	if (!em)
-		return -ENOMEM;
-	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-	if (!map) {
-		free_extent_map(em);
+	map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+	if (!map)
 		return -ENOMEM;
-	}
-
-	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-	em->map_lookup = map;
-	em->start = logical;
-	em->len = length;
-	em->orig_start = 0;
-	em->block_start = 0;
-	em->block_len = em->len;
 
+	map->start = logical;
+	map->chunk_len = length;
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6813,7 +7026,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	 */
 	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
 	map->verified_stripes = 0;
-	em->orig_block_len = btrfs_calc_stripe_length(em);
+	map->stripe_size = btrfs_calc_stripe_length(map);
 	for (i = 0; i < num_stripes; i++) {
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6829,7 +7042,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 								    devid, uuid);
 			if (IS_ERR(map->stripes[i].dev)) {
 				ret = PTR_ERR(map->stripes[i].dev);
-				free_extent_map(em);
+				btrfs_free_chunk_map(map);
 				return ret;
 			}
 		}
@@ -6838,15 +7051,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 				&(map->stripes[i].dev->dev_state));
 	}
 
-	write_lock(&map_tree->lock);
-	ret = add_extent_mapping(map_tree, em, 0);
-	write_unlock(&map_tree->lock);
+	ret = btrfs_add_chunk_map(fs_info, map);
 	if (ret < 0) {
 		btrfs_err(fs_info,
 			  "failed to add chunk map, start=%llu len=%llu: %d",
-			  em->start, em->len, ret);
+			  map->start, map->chunk_len, ret);
 	}
-	free_extent_map(em);
 
 	return ret;
 }
@@ -7156,26 +7366,21 @@ out_short_read:
 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
 					struct btrfs_device *failing_dev)
 {
-	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	u64 next_start = 0;
+	struct btrfs_chunk_map *map;
+	u64 next_start;
 	bool ret = true;
 
-	read_lock(&map_tree->lock);
-	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
-	read_unlock(&map_tree->lock);
+	map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
 	/* No chunk at all? Return false anyway */
-	if (!em) {
+	if (!map) {
 		ret = false;
 		goto out;
 	}
-	while (em) {
-		struct map_lookup *map;
+	while (map) {
 		int missing = 0;
 		int max_tolerated;
 		int i;
 
-		map = em->map_lookup;
 		max_tolerated =
 			btrfs_get_num_tolerated_disk_barrier_failures(
 					map->type);
@@ -7193,18 +7398,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
 			if (!failing_dev)
 				btrfs_warn(fs_info,
 	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
-				   em->start, missing, max_tolerated);
-			free_extent_map(em);
+				   map->start, missing, max_tolerated);
+			btrfs_free_chunk_map(map);
 			ret = false;
 			goto out;
 		}
-		next_start = extent_map_end(em);
-		free_extent_map(em);
+		next_start = map->start + map->chunk_len;
+		btrfs_free_chunk_map(map);
 
-		read_lock(&map_tree->lock);
-		em = lookup_extent_mapping(map_tree, next_start,
-					   (u64)(-1) - next_start);
-		read_unlock(&map_tree->lock);
+		map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
 	}
 out:
 	return ret;
@@ -7697,20 +7899,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 				 u64 physical_offset, u64 physical_len)
 {
 	struct btrfs_dev_lookup_args args = { .devid = devid };
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	struct btrfs_device *dev;
 	u64 stripe_len;
 	bool found = false;
 	int ret = 0;
 	int i;
 
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	read_unlock(&em_tree->lock);
-
-	if (!em) {
+	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+	if (!map) {
 		btrfs_err(fs_info,
 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
 			  physical_offset, devid);
@@ -7718,12 +7915,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
-	map = em->map_lookup;
-	stripe_len = btrfs_calc_stripe_length(em);
+	stripe_len = btrfs_calc_stripe_length(map);
 	if (physical_len != stripe_len) {
 		btrfs_err(fs_info,
 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
-			  physical_offset, devid, em->start, physical_len,
+			  physical_offset, devid, map->start, physical_len,
 			  stripe_len);
 		ret = -EUCLEAN;
 		goto out;
@@ -7746,7 +7942,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 			if (map->verified_stripes >= map->num_stripes) {
 				btrfs_err(fs_info,
 				"too many dev extents for chunk %llu found",
-					  em->start);
+					  map->start);
 				ret = -EUCLEAN;
 				goto out;
 			}
@@ -7792,32 +7988,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
 	}
 
 out:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }
 
 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
 	struct rb_node *node;
 	int ret = 0;
 
-	read_lock(&em_tree->lock);
-	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
-		em = rb_entry(node, struct extent_map, rb_node);
-		if (em->map_lookup->num_stripes !=
-		    em->map_lookup->verified_stripes) {
+	read_lock(&fs_info->mapping_tree_lock);
+	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+		struct btrfs_chunk_map *map;
+
+		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+		if (map->num_stripes != map->verified_stripes) {
 			btrfs_err(fs_info,
 			"chunk %llu has missing dev extent, have %d expect %d",
-				  em->start, em->map_lookup->verified_stripes,
-				  em->map_lookup->num_stripes);
+				  map->start, map->verified_stripes, map->num_stripes);
 			ret = -EUCLEAN;
 			goto out;
 		}
 	}
 out:
-	read_unlock(&em_tree->lock);
+	read_unlock(&fs_info->mapping_tree_lock);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cc374864a79..53f87f398da7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -426,7 +426,8 @@ struct btrfs_discard_stripe {
 struct btrfs_io_context {
 	refcount_t refs;
 	struct btrfs_fs_info *fs_info;
-	u64 map_type; /* get from map_lookup->type */
+	/* Taken from struct btrfs_chunk_map::type. */
+	u64 map_type;
 	struct bio *orig_bio;
 	atomic_t error;
 	u16 max_errors;
@@ -529,18 +530,32 @@ struct btrfs_raid_attr {
 
 extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
 
-struct map_lookup {
+struct btrfs_chunk_map {
+	struct rb_node rb_node;
+	/* For mount time dev extent verification. */
+	int verified_stripes;
+	refcount_t refs;
+	u64 start;
+	u64 chunk_len;
+	u64 stripe_size;
 	u64 type;
 	int io_align;
 	int io_width;
 	int num_stripes;
 	int sub_stripes;
-	int verified_stripes; /* For mount time dev extent verification */
 	struct btrfs_io_stripe stripes[];
 };
 
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+				 (sizeof(struct btrfs_io_stripe) * (n)))
+
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+	if (map && refcount_dec_and_test(&map->refs)) {
+		ASSERT(RB_EMPTY_NODE(&map->rb_node));
+		kfree(map);
+	}
+}
 
 struct btrfs_balance_args;
 struct btrfs_balance_progress;
@@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }
 
 /*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
  *
  * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
  * than 4G.  This does the proper type cast to avoid overflow.
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 					    u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       blk_mode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
@@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
 			   u64 logical, u64 len);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 				    u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
 int btrfs_nr_parity_stripes(u64 type);
 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_block_group *bg);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
-				       u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp);
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+					     u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+						    u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+					    u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
 void btrfs_release_disk_super(struct btrfs_super_block *super);
 
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3cf236fb40a4..6287763fdccc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
 	return btrfs_setxattr_trans(inode, name, buffer, size, flags);
 }
 
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+					    struct dentry *unused,
+					    struct inode *inode,
+					    const char *name, void *buffer,
+					    size_t size)
+{
+	int ret;
+	bool is_cap = false;
+
+	name = xattr_full_name(handler, name);
+
+	/*
+	 * security.capability doesn't cache the results, so calls into us
+	 * constantly to see if there's a capability xattr.  Cache the result
+	 * here in order to avoid wasting time doing lookups for xattrs we know
+	 * don't exist.
+	 */
+	if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+		is_cap = true;
+		if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+			return -ENODATA;
+	}
+
+	ret = btrfs_getxattr(inode, name, buffer, size);
+	if (ret == -ENODATA && is_cap)
+		set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+	return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+					    struct mnt_idmap *idmap,
+					    struct dentry *unused,
+					    struct inode *inode,
+					    const char *name,
+					    const void *buffer,
+					    size_t size, int flags)
+{
+	if (btrfs_root_readonly(BTRFS_I(inode)->root))
+		return -EROFS;
+
+	name = xattr_full_name(handler, name);
+	if (strcmp(name, XATTR_NAME_CAPS) == 0)
+		clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+	return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
 					struct mnt_idmap *idmap,
 					struct dentry *unused, struct inode *inode,
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
 
 static const struct xattr_handler btrfs_security_xattr_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
-	.get = btrfs_xattr_handler_get,
-	.set = btrfs_xattr_handler_set,
+	.get = btrfs_xattr_handler_get_security,
+	.set = btrfs_xattr_handler_set_security,
 };
 
 static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode,
 		}
 		strcpy(name, XATTR_SECURITY_PREFIX);
 		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+		if (strcmp(name, XATTR_NAME_CAPS) == 0)
+			clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
 		err = btrfs_setxattr(trans, inode, name, xattr->value,
 				     xattr->value_len, 0);
 		kfree(name);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 6c231a116a29..36cf1f0e338e 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 	workspace->strm.total_in = 0;
 	workspace->strm.total_out = 0;
 
-	out_page = alloc_page(GFP_NOFS);
+	out_page = btrfs_alloc_compr_page();
 	if (out_page == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS);
+			out_page = btrfs_alloc_compr_page();
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
@@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS);
+			out_page = btrfs_alloc_compr_page();
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 188378ca19c7..12066afc235c 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -781,7 +781,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 	 * Check mount options here, because we might change fs_info->zoned
 	 * from fs_info->zone_size.
 	 */
-	ret = btrfs_check_mountopts_zoned(fs_info);
+	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
 	if (ret)
 		return ret;
 
@@ -789,7 +789,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
 {
 	if (!btrfs_is_zoned(info))
 		return 0;
@@ -798,18 +798,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 	 * Space cache writing is not COWed. Disable that to avoid write errors
 	 * in sequential zones.
 	 */
-	if (btrfs_test_opt(info, SPACE_CACHE)) {
+	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
 		btrfs_err(info, "zoned: space cache v1 is not supported");
 		return -EINVAL;
 	}
 
-	if (btrfs_test_opt(info, NODATACOW)) {
+	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
 		btrfs_err(info, "zoned: NODATACOW not supported");
 		return -EINVAL;
 	}
 
-	btrfs_clear_and_info(info, DISCARD_ASYNC,
-			"zoned: async discard ignored and disabled for zoned mode");
+	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+		btrfs_info(info,
+			   "zoned: async discard ignored and disabled for zoned mode");
+		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+	}
 
 	return 0;
 }
@@ -1290,7 +1293,7 @@ struct zone_info {
 
 static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
 				struct zone_info *info, unsigned long *active,
-				struct map_lookup *map)
+				struct btrfs_chunk_map *map)
 {
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	struct btrfs_device *device = map->stripes[zone_idx].dev;
@@ -1393,7 +1396,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
 }
 
 static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
-				      struct map_lookup *map,
+				      struct btrfs_chunk_map *map,
 				      struct zone_info *zone_info,
 				      unsigned long *active)
 {
@@ -1435,7 +1438,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
 }
 
 static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
-					struct map_lookup *map,
+					struct btrfs_chunk_map *map,
 					struct zone_info *zone_info,
 					unsigned long *active)
 {
@@ -1483,7 +1486,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
 }
 
 static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
-					struct map_lookup *map,
+					struct btrfs_chunk_map *map,
 					struct zone_info *zone_info,
 					unsigned long *active)
 {
@@ -1515,7 +1518,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
 }
 
 static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
-					 struct map_lookup *map,
+					 struct btrfs_chunk_map *map,
 					 struct zone_info *zone_info,
 					 unsigned long *active)
 {
@@ -1552,9 +1555,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 logical = cache->start;
 	u64 length = cache->length;
 	struct zone_info *zone_info = NULL;
@@ -1575,17 +1576,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		return -EIO;
 	}
 
-	/* Get the chunk mapping */
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, logical, length);
-	read_unlock(&em_tree->lock);
-
-	if (!em)
+	map = btrfs_find_chunk_map(fs_info, logical, length);
+	if (!map)
 		return -EINVAL;
 
-	map = em->map_lookup;
-
-	cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+	cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS);
 	if (!cache->physical_map) {
 		ret = -ENOMEM;
 		goto out;
@@ -1687,12 +1682,11 @@ out:
 			spin_unlock(&fs_info->zone_active_bgs_lock);
 		}
 	} else {
-		kfree(cache->physical_map);
+		btrfs_free_chunk_map(cache->physical_map);
 		cache->physical_map = NULL;
 	}
 	bitmap_free(active);
 	kfree(zone_info);
-	free_extent_map(em);
 
 	return ret;
 }
@@ -1715,22 +1709,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
 	cache->zone_unusable = unusable;
 }
 
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
-			    struct extent_buffer *eb)
-{
-	if (!btrfs_is_zoned(eb->fs_info) ||
-	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
-		return;
-
-	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
-	memzero_extent_buffer(eb, 0, eb->len);
-	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
-	set_extent_buffer_dirty(eb);
-	set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
-			EXTENT_DIRTY, NULL);
-}
-
 bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -2082,7 +2060,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
 bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	struct btrfs_device *device;
 	u64 physical;
 	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2194,7 +2172,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	const bool is_metadata = (block_group->flags &
 			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
 	int ret = 0;
@@ -2643,7 +2621,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
 	/* Release reservation for currently active block groups. */
 	spin_lock(&fs_info->zone_active_bgs_lock);
 	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
-		struct map_lookup *map = block_group->physical_map;
+		struct btrfs_chunk_map *map = block_group->physical_map;
 
 		if (!(block_group->flags &
 		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..f24a5ffb7807 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 			       u64 *bytenr_ret);
 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
-			    struct extent_buffer *eb);
 bool btrfs_use_zone_append(struct btrfs_bio *bbio);
 void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
 int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
 	return -EOPNOTSUPP;
 }
 
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+					      unsigned long *mount_opt)
 {
 	return 0;
 }
@@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info(
 
 static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
 
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
-					  struct extent_buffer *eb) { }
-
 static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	return false;
@@ -323,7 +319,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
 			(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
 	}
 
-	/* Do not allow Host Manged zoned device */
+	/* Do not allow Host Managed zoned device. */
 	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
 }
 
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5511766485cd..0d66db8bc1d4 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 	workspace->in_buf.pos = 0;
 	workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
-
 	/* Allocate and map in the output buffer */
-	out_page = alloc_page(GFP_NOFS);
+	out_page = btrfs_alloc_compr_page();
 	if (out_page == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 				ret = -E2BIG;
 				goto out;
 			}
-			out_page = alloc_page(GFP_NOFS);
+			out_page = btrfs_alloc_compr_page();
 			if (out_page == NULL) {
 				ret = -ENOMEM;
 				goto out;
@@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
 			ret = -E2BIG;
 			goto out;
 		}
-		out_page = alloc_page(GFP_NOFS);
+		out_page = btrfs_alloc_compr_page();
 		if (out_page == NULL) {
 			ret = -ENOMEM;
 			goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index 3beda4bb59af..95b2fff91f67 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2878,7 +2878,12 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
 	if (IS_ERR(fc))
 		return PTR_ERR(fc);
 
+	/*
+	 * Indicate to the filesystem that the remount request is coming
+	 * from the legacy mount system call.
+	 */
 	fc->oldapi = true;
+
 	err = parse_monolithic_mount_data(fc, data);
 	if (!err) {
 		down_write(&sb->s_umount);
@@ -3328,6 +3333,12 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 	if (IS_ERR(fc))
 		return PTR_ERR(fc);
 
+	/*
+	 * Indicate to the filesystem that the mount request is coming
+	 * from the legacy mount system call.
+	 */
+	fc->oldapi = true;
+
 	if (subtype)
 		err = vfs_parse_fs_string(fc, "subtype",
 					  subtype, strlen(subtype));
author	Linus Torvalds <torvalds@linux-foundation.org>	2024-01-10 09:27:40 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-01-10 09:27:40 -0800
commit	affc5af36bbb62073b6aaa4f4459b38937ff5331 (patch)
tree	2267430b2cc94ce7c8e87ac362bcc28e89975db1 /fs
parent	12958e9c4c8e93ef694c10960c78453edf21526e (diff)
parent	e94dfb7a2935cb91faca88bf7136177d1ce0dda8 (diff)