author     Linus Torvalds <torvalds@linux-foundation.org>   2023-07-20 08:11:30 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-07-20 08:11:30 -0700
commit     46670259519f4ee4ab378dc014798aabe77c5057 (patch)
tree       e257a04d15a594f35650bea780a242c79b5c56d1
parent     2922800a1803f6319e329bdbfd2962fd83eb5360 (diff)
parent     aa84ce8a78a1a5c10cdf9c7a5fb0c999fbc2c8d6 (diff)
Merge tag 'for-6.5-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:
 "Stable fixes:
   - fix race between balance and cancel/pause
   - various iput() fixes
   - fix use-after-free of new block group that became unused
   - fix warning when putting transaction with qgroups enabled after
     abort
   - fix crash in subpage mode when page could be released between map
     and map read
   - when scrubbing raid56 verify the P/Q stripes unconditionally
   - fix minor memory leak in zoned mode when a block group with an
     unexpected superblock is found
  Regression fixes:
   - fix ordered extent split error handling when submitting direct IO
   - use irq-safe locking when adding delayed iputs"
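The delayed-iput item above comes down to a standard kernel locking rule: a spinlock that can be taken from irq context must be acquired with the irq-disabling variants everywhere it is used, otherwise an interrupt handler can fire while the lock is held in process context and deadlock on it. Below is a minimal sketch of that pattern under assumed names; every demo_* identifier is illustrative, not a btrfs symbol.

#include <linux/spinlock.h>
#include <linux/list.h>

struct demo_state {
	spinlock_t lock;
	struct list_head items;
};

/* May be called from irq or non-irq context: save and restore irq state. */
static void demo_add(struct demo_state *s, struct list_head *item)
{
	unsigned long flags;

	spin_lock_irqsave(&s->lock, flags);
	list_add_tail(item, &s->items);
	spin_unlock_irqrestore(&s->lock, flags);
}

/* Called only from process context: the plain _irq variants suffice. */
static void demo_drain(struct demo_state *s)
{
	spin_lock_irq(&s->lock);
	while (!list_empty(&s->items)) {
		struct list_head *item = s->items.next;

		list_del_init(item);
		/* Drop the lock (re-enabling irqs) around slow work. */
		spin_unlock_irq(&s->lock);
		/* ... process item, may sleep ... */
		spin_lock_irq(&s->lock);
	}
	spin_unlock_irq(&s->lock);
}

This mirrors the shape of the actual fix in the diff below, where btrfs_add_delayed_iput() moves to spin_lock_irqsave() and btrfs_run_delayed_iputs() to spin_lock_irq(), and the open-coded need_resched() dance replaces cond_resched_lock(), which cannot be used once irqs are disabled.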
* tag 'for-6.5-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix warning when putting transaction with qgroups enabled after abort
  btrfs: fix ordered extent split error handling in btrfs_dio_submit_io
  btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand
  btrfs: raid56: always verify the P/Q contents for scrub
  btrfs: use irq safe locking when running and adding delayed iputs
  btrfs: fix iput() on error pointer after error during orphan cleanup
  btrfs: fix double iput() on inode after an error during orphan cleanup
  btrfs: zoned: fix memory leak after finding block group with super blocks
  btrfs: fix use-after-free of new block group that became unused
  btrfs: be a bit more careful when setting mirror_num_ret in btrfs_map_block
  btrfs: fix race between balance and cancel/pause
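The mirror_num_ret commit in the list above is the usual guard for an optional output parameter: callers that do not need the value pass NULL, so the callee must check the pointer before storing through it. A tiny illustrative sketch, where demo_map_block() is a made-up name and not the btrfs function:

/* Illustrative only: NULL-check an optional out-parameter before use. */
static int demo_map_block(int *mirror_num_ret)
{
	int mirror_num = 1;	/* stand-in for the computed mirror */

	if (mirror_num_ret)	/* caller may not care about the value */
		*mirror_num_ret = mirror_num;
	return 0;
}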
-rw-r--r--   fs/btrfs/block-group.c   14
-rw-r--r--   fs/btrfs/block-group.h    5
-rw-r--r--   fs/btrfs/inode.c         77
-rw-r--r--   fs/btrfs/qgroup.c         1
-rw-r--r--   fs/btrfs/raid56.c        11
-rw-r--r--   fs/btrfs/volumes.c       17
6 files changed, 79 insertions, 46 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 48ae509f2ac2..23726152d62d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1640,13 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
 
-	trace_btrfs_add_unused_block_group(bg);
 	spin_lock(&fs_info->unused_bgs_lock);
 	if (list_empty(&bg->bg_list)) {
 		btrfs_get_block_group(bg);
+		trace_btrfs_add_unused_block_group(bg);
 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
-	} else {
+	} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
 		/* Pull out the block group from the reclaim_bgs list. */
+		trace_btrfs_add_unused_block_group(bg);
 		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
@@ -2087,6 +2088,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
 
 		/* Shouldn't have super stripes in sequential zones */
 		if (zoned && nr) {
+			kfree(logical);
 			btrfs_err(fs_info,
 			"zoned: block group %llu must not contain super block",
 				  cache->start);
@@ -2668,6 +2670,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 next:
 		btrfs_delayed_refs_rsv_release(fs_info, 1);
 		list_del_init(&block_group->bg_list);
+		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
 	}
 	btrfs_trans_release_chunk_metadata(trans);
 }
@@ -2707,6 +2710,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 	if (!cache)
 		return ERR_PTR(-ENOMEM);
 
+	/*
+	 * Mark it as new before adding it to the rbtree of block groups or any
+	 * list, so that no other task finds it and calls btrfs_mark_bg_unused()
+	 * before the new flag is set.
+	 */
+	set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
+
 	cache->length = size;
 	set_free_space_tree_thresholds(cache);
 	cache->flags = type;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index f204addc3fe8..381c54a56417 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -70,6 +70,11 @@ enum btrfs_block_group_flags {
 	BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
 	/* Indicate that the block group is placed on a sequential zone */
 	BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
+	/*
+	 * Indicate that block group is in the list of new block groups of a
+	 * transaction.
+	 */
+	BLOCK_GROUP_FLAG_NEW,
 };
 
 enum btrfs_caching_type {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbbb67293e34..49cef61f6a39 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3482,15 +3482,21 @@ zeroit:
 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	unsigned long flags;
 
 	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
 		return;
 
 	atomic_inc(&fs_info->nr_delayed_iputs);
-	spin_lock(&fs_info->delayed_iput_lock);
+	/*
+	 * Need to be irq safe here because we can be called from either an irq
+	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
+	 * context.
+	 */
+	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
 	ASSERT(list_empty(&inode->delayed_iput));
 	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
-	spin_unlock(&fs_info->delayed_iput_lock);
+	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
 	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
 		wake_up_process(fs_info->cleaner_kthread);
 }
@@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
 				    struct btrfs_inode *inode)
 {
 	list_del_init(&inode->delayed_iput);
-	spin_unlock(&fs_info->delayed_iput_lock);
+	spin_unlock_irq(&fs_info->delayed_iput_lock);
 	iput(&inode->vfs_inode);
 	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 		wake_up(&fs_info->delayed_iputs_wait);
-	spin_lock(&fs_info->delayed_iput_lock);
+	spin_lock_irq(&fs_info->delayed_iput_lock);
 }
 
 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
 				   struct btrfs_inode *inode)
 {
 	if (!list_empty(&inode->delayed_iput)) {
-		spin_lock(&fs_info->delayed_iput_lock);
+		spin_lock_irq(&fs_info->delayed_iput_lock);
 		if (!list_empty(&inode->delayed_iput))
 			run_delayed_iput_locked(fs_info, inode);
-		spin_unlock(&fs_info->delayed_iput_lock);
+		spin_unlock_irq(&fs_info->delayed_iput_lock);
 	}
 }
 
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 {
-
-	spin_lock(&fs_info->delayed_iput_lock);
+	/*
+	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
+	 * calls btrfs_add_delayed_iput() and that needs to lock
+	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
+	 * prevent a deadlock.
+	 */
+	spin_lock_irq(&fs_info->delayed_iput_lock);
 	while (!list_empty(&fs_info->delayed_iputs)) {
 		struct btrfs_inode *inode;
 
 		inode = list_first_entry(&fs_info->delayed_iputs,
 				struct btrfs_inode, delayed_iput);
 		run_delayed_iput_locked(fs_info, inode);
-		cond_resched_lock(&fs_info->delayed_iput_lock);
+		if (need_resched()) {
+			spin_unlock_irq(&fs_info->delayed_iput_lock);
+			cond_resched();
+			spin_lock_irq(&fs_info->delayed_iput_lock);
+		}
 	}
-	spin_unlock(&fs_info->delayed_iput_lock);
+	spin_unlock_irq(&fs_info->delayed_iput_lock);
 }
 
 /*
@@ -3659,11 +3674,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
 		inode = btrfs_iget(fs_info->sb, last_objectid, root);
-		ret = PTR_ERR_OR_ZERO(inode);
-		if (ret && ret != -ENOENT)
-			goto out;
+		if (IS_ERR(inode)) {
+			ret = PTR_ERR(inode);
+			inode = NULL;
+			if (ret != -ENOENT)
+				goto out;
+		}
 
-		if (ret == -ENOENT && root == fs_info->tree_root) {
+		if (!inode && root == fs_info->tree_root) {
 			struct btrfs_root *dead_root;
 			int is_dead_root = 0;
 
@@ -3724,17 +3742,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * deleted but wasn't. The inode number may have been reused,
 		 * but either way, we can delete the orphan item.
 		 */
-		if (ret == -ENOENT || inode->i_nlink) {
-			if (!ret) {
+		if (!inode || inode->i_nlink) {
+			if (inode) {
 				ret = btrfs_drop_verity_items(BTRFS_I(inode));
 				iput(inode);
+				inode = NULL;
 				if (ret)
 					goto out;
 			}
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
-				iput(inode);
 				goto out;
 			}
 			btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3742,10 +3760,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
 			btrfs_end_transaction(trans);
-			if (ret) {
-				iput(inode);
+			if (ret)
 				goto out;
-			}
 			continue;
 		}
 
@@ -4847,9 +4863,6 @@ again:
 		ret = -ENOMEM;
 		goto out;
 	}
-	ret = set_page_extent_mapped(page);
-	if (ret < 0)
-		goto out_unlock;
 
 	if (!PageUptodate(page)) {
 		ret = btrfs_read_folio(NULL, page_folio(page));
@@ -4864,6 +4877,17 @@ again:
 			goto out_unlock;
 		}
 	}
+
+	/*
+	 * We unlock the page after the io is completed and then re-lock it
+	 * above.  release_folio() could have come in between that and cleared
+	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
+	 * here to make sure it's properly set for the subpage stuff.
+	 */
+	ret = set_page_extent_mapped(page);
+	if (ret < 0)
+		goto out_unlock;
+
 	wait_on_page_writeback(page);
 
 	lock_extent(io_tree, block_start, block_end, &cached_state);
@@ -7849,8 +7873,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
 
 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
 		if (ret) {
-			bbio->bio.bi_status = errno_to_blk_status(ret);
-			btrfs_dio_end_io(bbio);
+			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+						    file_offset, dip->bytes,
+						    !ret);
+			bio->bi_status = errno_to_blk_status(ret);
+			iomap_dio_bio_end_io(bio);
 			return;
 		}
 	}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index da1f84a0eb29..2637d6b157ff 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
 		ulist_free(entry->old_roots);
 		kfree(entry);
 	}
+	*root = RB_ROOT;
 }
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f37b925d587f..0249ea52bb80 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work);
 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
 static void scrub_rbio_work_locked(struct work_struct *work);
 
 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
@@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 	return 0;
 }
 
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_io_context *bioc = rbio->bioc;
 	const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 	 */
 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-	if (!need_check)
-		goto writeback;
-
 	p_sector.page = alloc_page(GFP_NOFS);
 	if (!p_sector.page)
 		return -ENOMEM;
@@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 		q_sector.page = NULL;
 	}
 
-writeback:
 	/*
 	 * time to start writing.  Make bios for everything from the
 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
@@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
 
 static void scrub_rbio(struct btrfs_raid_bio *rbio)
 {
-	bool need_check = false;
 	int sector_nr;
 	int ret;
 
@@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
 	 * We have every sector properly prepared. Can finish the scrub
 	 * and writeback the good content.
 	 */
-	ret = finish_parity_scrub(rbio, need_check);
+	ret = finish_parity_scrub(rbio);
 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
 		int found_errors;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 73f9ea7672db..2ecb76cf3d91 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4078,14 +4078,6 @@ static int alloc_profile_is_valid(u64 flags, int extended)
 	return has_single_bit_set(flags);
 }
 
-static inline int balance_need_close(struct btrfs_fs_info *fs_info)
-{
-	/* cancel requested || normal exit path */
-	return atomic_read(&fs_info->balance_cancel_req) ||
-		(atomic_read(&fs_info->balance_pause_req) == 0 &&
-		 atomic_read(&fs_info->balance_cancel_req) == 0);
-}
-
 /*
  * Validate target profile against allowed profiles and return true if it's OK.
  * Otherwise print the error message and return false.
@@ -4275,6 +4267,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 	u64 num_devices;
 	unsigned seq;
 	bool reducing_redundancy;
+	bool paused = false;
 	int i;
 
 	if (btrfs_fs_closing(fs_info) ||
@@ -4405,6 +4398,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
 		btrfs_info(fs_info, "balance: paused");
 		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+		paused = true;
 	}
 	/*
 	 * Balance can be canceled by:
@@ -4433,8 +4427,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 		btrfs_update_ioctl_balance_args(fs_info, bargs);
 	}
 
-	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
-	    balance_need_close(fs_info)) {
+	/* We didn't pause, we can clean everything up. */
+	if (!paused) {
 		reset_balance_state(fs_info);
 		btrfs_exclop_finish(fs_info);
 	}
@@ -6404,7 +6398,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	    (op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
 	     !dev_replace->tgtdev)) {
 		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
-		*mirror_num_ret = mirror_num;
+		if (mirror_num_ret)
+			*mirror_num_ret = mirror_num;
 		*bioc_ret = NULL;
 		ret = 0;
 		goto out;
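The two orphan-cleanup commits in this pull fix an iput() on an ERR_PTR and a double iput() on the same inode. Both reduce to one idiom, visible in the inode.c hunks above: clear the pointer as soon as it is invalid or consumed, so later error paths can neither dereference it nor put it a second time. A self-contained sketch of that idiom follows; demo_lookup() and demo_process() are hypothetical helpers, not btrfs code.

#include <linux/fs.h>
#include <linux/err.h>

/* Hypothetical helpers, declared only so the sketch is self-contained. */
static struct inode *demo_lookup(struct super_block *sb, u64 ino);
static int demo_process(struct inode *inode);

static int demo_orphan_step(struct super_block *sb, u64 ino)
{
	struct inode *inode;
	int ret = 0;

	inode = demo_lookup(sb, ino);		/* may return an ERR_PTR */
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;			/* never iput() an ERR_PTR */
		if (ret != -ENOENT)
			return ret;
	}

	if (inode) {
		ret = demo_process(inode);
		iput(inode);
		inode = NULL;			/* reference consumed: must not iput() again */
		if (ret)
			return ret;
	}

	/* Any error path from here sees inode == NULL, so no double put. */
	return 0;
}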
