Merge tag 'vfs-6.7.super' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs

Pull vfs superblock updates from Christian Brauner: "This contains the work to make block device opening functions return a struct bdev_handle instead of just a struct block_device. The same struct bdev_handle is then also passed to block device closing functions. This allows us to propagate context from opening to closing a block device without having to modify all users everytime. Sidenote, in the future we might even want to try and have block device opening functions return a struct file directly but that's a series on top of this. These are further preparatory changes to be able to count writable opens and blocking writes to mounted block devices. That's a separate piece of work for next cycle and for that we absolutely need the changes to btrfs that have been quietly dropped somehow. Originally the series contained a patch that removed the old blkdev_*() helpers. But since this would've caused needles churn in -next for bcachefs we ended up delaying it. The second piece of work addresses one of the major annoyances about the work last cycle, namely that we required dropping s_umount whenever we used the superblock and fs_holder_ops for a block device. The reason for that requirement had been that in some codepaths s_umount could've been taken under disk->open_mutex (that's always been the case, at least theoretically). For example, on surprise block device removal or media change. And opening and closing block devices required grabbing disk->open_mutex as well. So we did the work and went through the block layer and fixed all those places so that s_umount is never taken under disk->open_mutex. This means no more brittle games where we yield and reacquire s_umount during block device opening and closing and no more requirements where block devices need to be closed. Filesystems don't need to care about this. There's a bunch of other follow-up work such as moving block device freezing and thawing to holder operations which makes it work for all block devices and not just the main block device just as we did for surprise removal. But that is for next cycle. Tested with fstests for all major fses, blktests, LTP" * tag 'vfs-6.7.super' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs: (37 commits) porting: update locking requirements fs: assert that open_mutex isn't held over holder ops block: assert that we're not holding open_mutex over blk_report_disk_dead block: move bdev_mark_dead out of disk_check_media_change block: WARN_ON_ONCE() when we remove active partitions block: simplify bdev_del_partition() fs: Avoid grabbing sb->s_umount under bdev->bd_holder_lock jfs: fix log->bdev_handle null ptr deref in lbmStartIO bcache: Fixup error handling in register_cache() xfs: Convert to bdev_open_by_path() reiserfs: Convert to bdev_open_by_dev/path() ocfs2: Convert to use bdev_open_by_dev() nfs/blocklayout: Convert to use bdev_open_by_dev/path() jfs: Convert to bdev_open_by_dev() f2fs: Convert to bdev_open_by_dev/path() ext4: Convert to bdev_open_by_dev() erofs: Convert to use bdev_open_by_path() btrfs: Convert to bdev_open_by_path() fs: Convert to bdev_open_by_dev() mm/swap: Convert to use bdev_open_by_dev() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-10-30 08:59:05 -1000
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-10-30 08:59:05 -1000
commit: d4e175f2c460fd54011117d835aa017d2d4a8c08 (patch)
tree: 7845d9a363f5f0f633f655d9da0643dda3af3b39 /block
parent: ffc253263a1375a65fa6c9f62a893e9767fbebfa (diff)
parent: 5aa9130acb98bacacc8bd9f1489a9269430d0eb8 (diff)
6 files changed, 142 insertions, 58 deletions
diff --git a/block/bdev.c b/block/bdev.c
index f3b13aa1b7d4..2018d250e131 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -829,6 +829,28 @@ put_blkdev:
 }
 EXPORT_SYMBOL(blkdev_get_by_dev);
 
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				     const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+	struct block_device *bdev;
+
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
+	if (IS_ERR(bdev)) {
+		kfree(handle);
+		return ERR_CAST(bdev);
+	}
+	handle->bdev = bdev;
+	handle->holder = holder;
+	if (holder)
+		mode |= BLK_OPEN_EXCL;
+	handle->mode = mode;
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_dev);
+
 /**
  * blkdev_get_by_path - open a block device by name
  * @path: path to the block device to open
@@ -867,6 +889,28 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
 
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle;
+	dev_t dev;
+	int error;
+
+	error = lookup_bdev(path, &dev);
+	if (error)
+		return ERR_PTR(error);
+
+	handle = bdev_open_by_dev(dev, mode, holder, hops);
+	if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+	    bdev_read_only(handle->bdev)) {
+		bdev_release(handle);
+		return ERR_PTR(-EACCES);
+	}
+
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_path);
+
 void blkdev_put(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -903,6 +947,13 @@ void blkdev_put(struct block_device *bdev, void *holder)
 }
 EXPORT_SYMBOL(blkdev_put);
 
+void bdev_release(struct bdev_handle *handle)
+{
+	blkdev_put(handle->bdev, handle->holder);
+	kfree(handle);
+}
+EXPORT_SYMBOL(bdev_release);
+
 /**
  * lookup_bdev() - Look up a struct block_device by name.
  * @pathname: Name of the block device in the filesystem.
@@ -961,20 +1012,20 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
 	mutex_lock(&bdev->bd_holder_lock);
 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
 		bdev->bd_holder_ops->mark_dead(bdev, surprise);
-	else
+	else {
+		mutex_unlock(&bdev->bd_holder_lock);
 		sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_holder_lock);
+	}
 
 	invalidate_bdev(bdev);
 }
-#ifdef CONFIG_DASD_MODULE
 /*
- * Drivers should not use this directly, but the DASD driver has historically
- * had a shutdown to offline mode that doesn't actually remove the gendisk
- * that otherwise looks a lot like a safe device removal.
+ * New drivers should not use this directly.  There are some drivers however
+ * that needs this for historical reasons. For example, the DASD driver has
+ * historically had a shutdown to offline mode that doesn't actually remove the
+ * gendisk that otherwise looks a lot like a safe device removal.
  */
 EXPORT_SYMBOL_GPL(bdev_mark_dead);
-#endif
 
 void sync_bdevs(bool wait)
 {
diff --git a/block/disk-events.c b/block/disk-events.c
index 13c3372c465a..2f697224386a 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -266,11 +266,8 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
  * disk_check_media_change - check if a removable media has been changed
  * @disk: gendisk to check
  *
- * Check whether a removable media has been changed, and attempt to free all
- * dentries and inodes and invalidates all block device page cache entries in
- * that case.
- *
- * Returns %true if the media has changed, or %false if not.
+ * Returns %true and marks the disk for a partition rescan whether a removable
+ * media has been changed, and %false if the media did not change.
  */
 bool disk_check_media_change(struct gendisk *disk)
 {
@@ -278,12 +275,11 @@ bool disk_check_media_change(struct gendisk *disk)
 
 	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
 				   DISK_EVENT_EJECT_REQUEST);
-	if (!(events & DISK_EVENT_MEDIA_CHANGE))
-		return false;
-
-	bdev_mark_dead(disk->part0, true);
-	set_bit(GD_NEED_PART_SCAN, &disk->state);
-	return true;
+	if (events & DISK_EVENT_MEDIA_CHANGE) {
+		set_bit(GD_NEED_PART_SCAN, &disk->state);
+		return true;
+	}
+	return false;
 }
 EXPORT_SYMBOL(disk_check_media_change);
 
diff --git a/block/fops.c b/block/fops.c
index 73e42742543f..0abaac705daf 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -542,15 +542,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
 	return error;
 }
 
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for file just being open (e.g. during ->open
+ * callback) and for file that is already open. This is actually non-trivial
+ * (see comment in the function).
+ */
 blk_mode_t file_to_blk_mode(struct file *file)
 {
 	blk_mode_t mode = 0;
+	struct bdev_handle *handle = file->private_data;
 
 	if (file->f_mode & FMODE_READ)
 		mode |= BLK_OPEN_READ;
 	if (file->f_mode & FMODE_WRITE)
 		mode |= BLK_OPEN_WRITE;
-	if (file->private_data)
+	/*
+	 * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
+	 * determine whether the open was exclusive for already open files.
+	 */
+	if (handle)
+		mode |= handle->mode & BLK_OPEN_EXCL;
+	else if (file->f_flags & O_EXCL)
 		mode |= BLK_OPEN_EXCL;
 	if (file->f_flags & O_NDELAY)
 		mode |= BLK_OPEN_NDELAY;
@@ -568,7 +584,8 @@ blk_mode_t file_to_blk_mode(struct file *file)
 
 static int blkdev_open(struct inode *inode, struct file *filp)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
+	blk_mode_t mode;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -579,29 +596,24 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	filp->f_flags |= O_LARGEFILE;
 	filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
 
-	/*
-	 * Use the file private data to store the holder for exclusive openes.
-	 * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL.
-	 */
-	if (filp->f_flags & O_EXCL)
-		filp->private_data = filp;
-
-	bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp),
-				 filp->private_data, NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	mode = file_to_blk_mode(filp);
+	handle = bdev_open_by_dev(inode->i_rdev, mode,
+			mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	if (bdev_nowait(bdev))
+	if (bdev_nowait(handle->bdev))
 		filp->f_mode |= FMODE_NOWAIT;
 
-	filp->f_mapping = bdev->bd_inode->i_mapping;
+	filp->f_mapping = handle->bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+	filp->private_data = handle;
 	return 0;
 }
 
 static int blkdev_release(struct inode *inode, struct file *filp)
 {
-	blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data);
+	bdev_release(filp->private_data);
 	return 0;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index cc32a0c704eb..c9d06f72c587 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent);
 
 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
 	int ret = 0;
 
 	if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
@@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 	}
 
 	set_bit(GD_NEED_PART_SCAN, &disk->state);
-	bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
-				 NULL);
-	if (IS_ERR(bdev))
-		ret =  PTR_ERR(bdev);
+	handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
+				  NULL);
+	if (IS_ERR(handle))
+		ret = PTR_ERR(handle);
 	else
-		blkdev_put(bdev, NULL);
+		bdev_release(handle);
 
 	/*
 	 * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
@@ -559,6 +559,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
 	struct block_device *bdev;
 	unsigned long idx;
 
+	/*
+	 * On surprise disk removal, bdev_mark_dead() may call into file
+	 * systems below. Make it clear that we're expecting to not hold
+	 * disk->open_mutex.
+	 */
+	lockdep_assert_not_held(&disk->open_mutex);
+
 	rcu_read_lock();
 	xa_for_each(&disk->part_tbl, idx, bdev) {
 		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
diff --git a/block/ioctl.c b/block/ioctl.c
index d5f5cd61efd7..4160f4e6bd5b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -370,9 +370,10 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
 	mutex_lock(&bdev->bd_holder_lock);
 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
 		bdev->bd_holder_ops->sync(bdev);
-	else
+	else {
+		mutex_unlock(&bdev->bd_holder_lock);
 		sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_holder_lock);
+	}
 
 	invalidate_bdev(bdev);
 	return 0;
@@ -467,6 +468,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
 		int __user *argp)
 {
 	int ret, n;
+	struct bdev_handle *handle;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -478,10 +480,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
 	if (mode & BLK_OPEN_EXCL)
 		return set_blocksize(bdev, n);
 
-	if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL)))
+	handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
+	if (IS_ERR(handle))
 		return -EBUSY;
 	ret = set_blocksize(bdev, n);
-	blkdev_put(bdev, &bdev);
+	bdev_release(handle);
 
 	return ret;
 }
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e137a87f4db0..f47ffcfdfcec 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -274,17 +274,6 @@ void drop_partition(struct block_device *part)
 	put_device(&part->bd_device);
 }
 
-static void delete_partition(struct block_device *part)
-{
-	/*
-	 * Remove the block device from the inode hash, so that it cannot be
-	 * looked up any more even when openers still hold references.
-	 */
-	remove_inode_hash(part->bd_inode);
-	bdev_mark_dead(part, false);
-	drop_partition(part);
-}
-
 static ssize_t whole_disk_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -485,7 +474,18 @@ int bdev_del_partition(struct gendisk *disk, int partno)
 	if (atomic_read(&part->bd_openers))
 		goto out_unlock;
 
-	delete_partition(part);
+	/*
+	 * We verified that @part->bd_openers is zero above and so
+	 * @part->bd_holder{_ops} can't be set. And since we hold
+	 * @disk->open_mutex the device can't be claimed by anyone.
+	 *
+	 * So no need to call @part->bd_holder_ops->mark_dead() here.
+	 * Just delete the partition and invalidate it.
+	 */
+
+	remove_inode_hash(part->bd_inode);
+	invalidate_bdev(part);
+	drop_partition(part);
 	ret = 0;
 out_unlock:
 	mutex_unlock(&disk->open_mutex);
@@ -663,8 +663,23 @@ rescan:
 	sync_blockdev(disk->part0);
 	invalidate_bdev(disk->part0);
 
-	xa_for_each_start(&disk->part_tbl, idx, part, 1)
-		delete_partition(part);
+	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+		/*
+		 * Remove the block device from the inode hash, so that
+		 * it cannot be looked up any more even when openers
+		 * still hold references.
+		 */
+		remove_inode_hash(part->bd_inode);
+
+		/*
+		 * If @disk->open_partitions isn't elevated but there's
+		 * still an active holder of that block device things
+		 * are broken.
+		 */
+		WARN_ON_ONCE(atomic_read(&part->bd_openers));
+		invalidate_bdev(part);
+		drop_partition(part);
+	}
 	clear_bit(GD_NEED_PART_SCAN, &disk->state);
 
 	/*
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-10-30 08:59:05 -1000
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-10-30 08:59:05 -1000
commit	d4e175f2c460fd54011117d835aa017d2d4a8c08 (patch)
tree	7845d9a363f5f0f633f655d9da0643dda3af3b39 /block
parent	ffc253263a1375a65fa6c9f62a893e9767fbebfa (diff)
parent	5aa9130acb98bacacc8bd9f1489a9269430d0eb8 (diff)