From 2d3a8e2deddea6c89961c422ec0c5b851e648c14 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Tue, 16 Jun 2020 20:16:55 +0800 Subject: block: Fix use-after-free in blkdev_get() In blkdev_get() we call __blkdev_get() to do some internal jobs and if there is some errors in __blkdev_get(), the bdput() is called which means we have released the refcount of the bdev (actually the refcount of the bdev inode). This means we cannot access bdev after that point. But acctually bdev is still accessed in blkdev_get() after calling __blkdev_get(). This results in use-after-free if the refcount is the last one we released in __blkdev_get(). Let's take a look at the following scenerio: CPU0 CPU1 CPU2 blkdev_open blkdev_open Remove disk bd_acquire blkdev_get __blkdev_get del_gendisk bdev_unhash_inode bd_acquire bdev_get_gendisk bd_forget failed because of unhashed bdput bdput (the last one) bdev_evict_inode access bdev => use after free [ 459.350216] BUG: KASAN: use-after-free in __lock_acquire+0x24c1/0x31b0 [ 459.351190] Read of size 8 at addr ffff88806c815a80 by task syz-executor.0/20132 [ 459.352347] [ 459.352594] CPU: 0 PID: 20132 Comm: syz-executor.0 Not tainted 4.19.90 #2 [ 459.353628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 459.354947] Call Trace: [ 459.355337] dump_stack+0x111/0x19e [ 459.355879] ? __lock_acquire+0x24c1/0x31b0 [ 459.356523] print_address_description+0x60/0x223 [ 459.357248] ? __lock_acquire+0x24c1/0x31b0 [ 459.357887] kasan_report.cold+0xae/0x2d8 [ 459.358503] __lock_acquire+0x24c1/0x31b0 [ 459.359120] ? _raw_spin_unlock_irq+0x24/0x40 [ 459.359784] ? lockdep_hardirqs_on+0x37b/0x580 [ 459.360465] ? _raw_spin_unlock_irq+0x24/0x40 [ 459.361123] ? finish_task_switch+0x125/0x600 [ 459.361812] ? finish_task_switch+0xee/0x600 [ 459.362471] ? mark_held_locks+0xf0/0xf0 [ 459.363108] ? __schedule+0x96f/0x21d0 [ 459.363716] lock_acquire+0x111/0x320 [ 459.364285] ? blkdev_get+0xce/0xbe0 [ 459.364846] ? blkdev_get+0xce/0xbe0 [ 459.365390] __mutex_lock+0xf9/0x12a0 [ 459.365948] ? blkdev_get+0xce/0xbe0 [ 459.366493] ? bdev_evict_inode+0x1f0/0x1f0 [ 459.367130] ? blkdev_get+0xce/0xbe0 [ 459.367678] ? destroy_inode+0xbc/0x110 [ 459.368261] ? mutex_trylock+0x1a0/0x1a0 [ 459.368867] ? __blkdev_get+0x3e6/0x1280 [ 459.369463] ? bdev_disk_changed+0x1d0/0x1d0 [ 459.370114] ? blkdev_get+0xce/0xbe0 [ 459.370656] blkdev_get+0xce/0xbe0 [ 459.371178] ? find_held_lock+0x2c/0x110 [ 459.371774] ? __blkdev_get+0x1280/0x1280 [ 459.372383] ? lock_downgrade+0x680/0x680 [ 459.373002] ? lock_acquire+0x111/0x320 [ 459.373587] ? bd_acquire+0x21/0x2c0 [ 459.374134] ? do_raw_spin_unlock+0x4f/0x250 [ 459.374780] blkdev_open+0x202/0x290 [ 459.375325] do_dentry_open+0x49e/0x1050 [ 459.375924] ? blkdev_get_by_dev+0x70/0x70 [ 459.376543] ? __x64_sys_fchdir+0x1f0/0x1f0 [ 459.377192] ? inode_permission+0xbe/0x3a0 [ 459.377818] path_openat+0x148c/0x3f50 [ 459.378392] ? kmem_cache_alloc+0xd5/0x280 [ 459.379016] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.379802] ? path_lookupat.isra.0+0x900/0x900 [ 459.380489] ? __lock_is_held+0xad/0x140 [ 459.381093] do_filp_open+0x1a1/0x280 [ 459.381654] ? may_open_dev+0xf0/0xf0 [ 459.382214] ? find_held_lock+0x2c/0x110 [ 459.382816] ? lock_downgrade+0x680/0x680 [ 459.383425] ? __lock_is_held+0xad/0x140 [ 459.384024] ? do_raw_spin_unlock+0x4f/0x250 [ 459.384668] ? _raw_spin_unlock+0x1f/0x30 [ 459.385280] ? __alloc_fd+0x448/0x560 [ 459.385841] do_sys_open+0x3c3/0x500 [ 459.386386] ? filp_open+0x70/0x70 [ 459.386911] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 459.387610] ? trace_hardirqs_off_caller+0x55/0x1c0 [ 459.388342] ? do_syscall_64+0x1a/0x520 [ 459.388930] do_syscall_64+0xc3/0x520 [ 459.389490] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.390248] RIP: 0033:0x416211 [ 459.390720] Code: 75 14 b8 02 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 04 19 00 00 c3 48 83 ec 08 e8 0a fa ff ff 48 89 04 24 b8 02 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 53 fa ff ff 48 89 d0 48 83 c4 08 48 3d 01 [ 459.393483] RSP: 002b:00007fe45dfe9a60 EFLAGS: 00000293 ORIG_RAX: 0000000000000002 [ 459.394610] RAX: ffffffffffffffda RBX: 00007fe45dfea6d4 RCX: 0000000000416211 [ 459.395678] RDX: 00007fe45dfe9b0a RSI: 0000000000000002 RDI: 00007fe45dfe9b00 [ 459.396758] RBP: 000000000076bf20 R08: 0000000000000000 R09: 000000000000000a [ 459.397930] R10: 0000000000000075 R11: 0000000000000293 R12: 00000000ffffffff [ 459.399022] R13: 0000000000000bd9 R14: 00000000004cdb80 R15: 000000000076bf2c [ 459.400168] [ 459.400430] Allocated by task 20132: [ 459.401038] kasan_kmalloc+0xbf/0xe0 [ 459.401652] kmem_cache_alloc+0xd5/0x280 [ 459.402330] bdev_alloc_inode+0x18/0x40 [ 459.402970] alloc_inode+0x5f/0x180 [ 459.403510] iget5_locked+0x57/0xd0 [ 459.404095] bdget+0x94/0x4e0 [ 459.404607] bd_acquire+0xfa/0x2c0 [ 459.405113] blkdev_open+0x110/0x290 [ 459.405702] do_dentry_open+0x49e/0x1050 [ 459.406340] path_openat+0x148c/0x3f50 [ 459.406926] do_filp_open+0x1a1/0x280 [ 459.407471] do_sys_open+0x3c3/0x500 [ 459.408010] do_syscall_64+0xc3/0x520 [ 459.408572] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 459.409415] [ 459.409679] Freed by task 1262: [ 459.410212] __kasan_slab_free+0x129/0x170 [ 459.410919] kmem_cache_free+0xb2/0x2a0 [ 459.411564] rcu_process_callbacks+0xbb2/0x2320 [ 459.412318] __do_softirq+0x225/0x8ac Fix this by delaying bdput() to the end of blkdev_get() which means we have finished accessing bdev. Fixes: 77ea887e433a ("implement in-kernel gendisk events handling") Reported-by: Hulk Robot Signed-off-by: Jason Yan Tested-by: Sedat Dilek Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dan Carpenter Cc: Christoph Hellwig Cc: Jens Axboe Cc: Ming Lei Cc: Jan Kara Cc: Dan Carpenter Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 47860e589388..08c87db3a92b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1565,10 +1565,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ if (!for_part) { ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); + if (ret != 0) return ret; - } } restart: @@ -1637,8 +1635,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, mode, 1); - if (ret) + if (ret) { + bdput(whole); goto out_clear; + } bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || @@ -1688,7 +1688,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_unblock_events(disk); put_disk_and_module(disk); out: - bdput(bdev); return ret; } @@ -1755,6 +1754,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) bdput(whole); } + if (res) + bdput(bdev); + return res; } EXPORT_SYMBOL(blkdev_get); -- cgit From 3373a3461aa15b7f9a871fa4cb2c9ef21ac20b47 Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Thu, 18 Jun 2020 12:21:38 +0800 Subject: block: make function 'kill_bdev' static kill_bdev does not have any external user, so make it static. Signed-off-by: Zheng Bin Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- fs/block_dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 08c87db3a92b..0ae656e022fd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -void kill_bdev(struct block_device *bdev) +static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; @@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev) invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); -} -EXPORT_SYMBOL(kill_bdev); +} /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) -- cgit From a304f0744824fd37d6e1aab4f9715f907724ad11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:14:08 -0600 Subject: block: flag block devices as supporting IOCB_WAITQ Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 0ae656e022fd..679d9346b871 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1851,7 +1851,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; -- cgit From 764b23bd9af8ff8ecc664816e39d4791b6a72bfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 Jun 2020 09:16:36 +0200 Subject: block: mark bd_finish_claiming static Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- fs/block_dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 0ae656e022fd..0e0d43dc27d3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1187,8 +1187,8 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, - void *holder) +static void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder) { spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); @@ -1203,7 +1203,6 @@ void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } -EXPORT_SYMBOL(bd_finish_claiming); /** * bd_abort_claiming - abort claiming of a block device -- cgit From 5ff9f19231a0e670b3d79c563f1b0b185abeca91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jun 2020 10:01:54 +0200 Subject: block: simplify set_init_blocksize The loop to increase the initial block size doesn't really make any sense, as the AND operation won't match for powers of two if it didn't for the initial block size. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 0e0d43dc27d3..8b7a9c76d33e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -106,13 +106,7 @@ EXPORT_SYMBOL(invalidate_bdev); static void set_init_blocksize(struct block_device *bdev) { unsigned bsize = bdev_logical_block_size(bdev); - loff_t size = i_size_read(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } bdev->bd_block_size = bsize; bdev->bd_inode->i_blkbits = blksize_bits(bsize); } -- cgit From 6b7b181b67aa8177e57732723106a0411570a86d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jun 2020 10:01:55 +0200 Subject: block: remove the bd_block_size field from struct block_device We can trivially calculate the block size from the inodes i_blkbits variable. Use that instead of keeping two redundant copies of the information in slightly different formats. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8b7a9c76d33e..06d31e459048 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -105,10 +105,7 @@ EXPORT_SYMBOL(invalidate_bdev); static void set_init_blocksize(struct block_device *bdev) { - unsigned bsize = bdev_logical_block_size(bdev); - - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); + bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); } int set_blocksize(struct block_device *bdev, int size) @@ -122,9 +119,8 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Don't change the size if it is same as current */ - if (bdev->bd_block_size != size) { + if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); - bdev->bd_block_size = size; bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } @@ -889,7 +885,6 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; -- cgit From e556f6ba10f0f3c3484a1597382ceaec1e7bc700 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jun 2020 10:01:56 +0200 Subject: block: remove the bd_queue field from struct block_device Just use bd_disk->queue instead. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 06d31e459048..68cb08bc1b7a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -693,12 +693,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, 0); + result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); - blk_queue_exit(bdev->bd_queue); + blk_queue_exit(bdev->bd_disk->queue); return result; } @@ -729,7 +729,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, 0); + result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; @@ -742,7 +742,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, clean_page_buffers(page); unlock_page(page); } - blk_queue_exit(bdev->bd_queue); + blk_queue_exit(bdev->bd_disk->queue); return result; } @@ -1568,7 +1568,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { first_open = true; bdev->bd_disk = disk; - bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; bdev->bd_partno = partno; @@ -1589,7 +1588,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk_and_module(disk); @@ -1666,7 +1664,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - bdev->bd_queue = NULL; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; -- cgit From 1008fe6dc36dd87dfd02d4307f49162f0b4f1665 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jun 2020 10:01:58 +0200 Subject: block: remove the all_bdevs list Instead just iterate over the inodes for the block device superblock. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 68cb08bc1b7a..2d2fcb50e78e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -773,7 +773,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif @@ -789,9 +788,6 @@ static void bdev_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); - spin_lock(&bdev_lock); - list_del_init(&bdev->bd_list); - spin_unlock(&bdev_lock); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { @@ -866,8 +862,6 @@ static int bdev_set(struct inode *inode, void *data) return 0; } -static LIST_HEAD(all_bdevs); - struct block_device *bdget(dev_t dev) { struct block_device *bdev; @@ -892,9 +886,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - spin_lock(&bdev_lock); - list_add(&bdev->bd_list, &all_bdevs); - spin_unlock(&bdev_lock); unlock_new_inode(inode); } return bdev; @@ -915,13 +906,14 @@ EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { - struct block_device *bdev; + struct inode *inode; long ret = 0; - spin_lock(&bdev_lock); - list_for_each_entry(bdev, &all_bdevs, bd_list) { - ret += bdev->bd_inode->i_mapping->nrpages; - } - spin_unlock(&bdev_lock); + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + ret += inode->i_mapping->nrpages; + spin_unlock(&blockdev_superblock->s_inode_list_lock); + return ret; } -- cgit From 9a3ffbbc6561ff878261727585ef8ee89bb36424 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 14:25:43 +0200 Subject: block: remove flush_disk flush_disk has only two callers, so open code it there. That also helps clarifying the error message for the particular case, and allows to remove setting bd_invalidated in check_disk_size_change, which will be cleared again instantly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 2d2fcb50e78e..a36d5b6907ea 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1347,26 +1347,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -/** - * flush_disk - invalidates all buffer-cache entries on a disk - * - * @bdev: struct block device to be flushed - * @kill_dirty: flag to guide handling of dirty inodes - * - * Invalidates all buffer-cache entries on a disk. It should be called - * when a disk has been changed -- either by a media change or online - * resize. - */ -static void flush_disk(struct block_device *bdev, bool kill_dirty) -{ - if (__invalidate_device(bdev, kill_dirty)) { - printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", - bdev->bd_disk ? bdev->bd_disk->disk_name : ""); - } - bdev->bd_invalidated = 1; -} - /** * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check @@ -1391,8 +1371,9 @@ static void check_disk_size_change(struct gendisk *disk, disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); - if (bdev_size > disk_size) - flush_disk(bdev, false); + if (bdev_size > disk_size && __invalidate_device(bdev, false)) + pr_warn("VFS: busy inodes on resized disk %s\n", + disk->disk_name); } bdev->bd_invalidated = 0; } @@ -1451,7 +1432,10 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev, true); + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + bdev->bd_invalidated = 1; if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; -- cgit From c5638ab417660e2c5d681aed5ce76df992982dbb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 16:33:07 +0200 Subject: block: simplify the restart case in __blkdev_get Insted of duplicating all the cleanup logic jump to the code that cleans up anyway, and restart after that. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index a36d5b6907ea..376832250c8e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1517,7 +1517,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) int ret; int partno; int perm = 0; - bool first_open = false; + bool first_open = false, need_restart; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1533,7 +1533,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } restart: - + need_restart = false; ret = -ENXIO; disk = bdev_get_gendisk(bdev, &partno); if (!disk) @@ -1556,19 +1556,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); - if (ret == -ERESTARTSYS) { - /* Lost a race with 'disk' being - * deleted, try again. - * See md.c - */ - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - bdev->bd_disk = NULL; - mutex_unlock(&bdev->bd_mutex); - disk_unblock_events(disk); - put_disk_and_module(disk); - goto restart; - } + /* + * If we lost a race with 'disk' being deleted, + * try again. See md.c + */ + if (ret == -ERESTARTSYS) + need_restart = true; } if (!ret) { @@ -1647,6 +1640,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk_and_module(disk); + if (need_restart) + goto restart; out: return ret; -- cgit From 58e46ed9cc634b3c64ea402789f8a92ce45cee48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 16:33:08 +0200 Subject: block: refactor bd_start_claiming Move the locking and assignment of bd_claiming from bd_start_claiming to bd_prepare_to_claim. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 376832250c8e..b7b2ee4b288a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1015,19 +1015,14 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, } /** - * bd_prepare_to_claim - prepare to claim a block device + * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * - * Prepare to claim @bdev. This function fails if @bdev is already - * claimed by another holder and waits if another claiming is in - * progress. This function doesn't actually claim. On successful - * return, the caller has ownership of bd_claiming and bd_holder[s]. - * - * CONTEXT: - * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab - * it multiple times. + * Claim @bdev. This function fails if @bdev is already claimed by another + * holder and waits if another claiming is in progress. return, the caller + * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. @@ -1036,9 +1031,12 @@ static int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, void *holder) { retry: + spin_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) + if (!bd_may_claim(bdev, whole, holder)) { + spin_unlock(&bdev_lock); return -EBUSY; + } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { @@ -1049,11 +1047,12 @@ retry: spin_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); - spin_lock(&bdev_lock); goto retry; } /* yay, all mine */ + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); return 0; } @@ -1134,19 +1133,13 @@ struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) if (!whole) return ERR_PTR(-ENOMEM); - /* prepare to claim, if successful, mark claiming in progress */ - spin_lock(&bdev_lock); - err = bd_prepare_to_claim(bdev, whole, holder); - if (err == 0) { - whole->bd_claiming = holder; - spin_unlock(&bdev_lock); - return whole; - } else { - spin_unlock(&bdev_lock); + if (err) { bdput(whole); return ERR_PTR(err); } + + return whole; } EXPORT_SYMBOL(bd_start_claiming); -- cgit From ecbe6bc0003bfd5bf8581cb679cae0eb944432cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 16:33:09 +0200 Subject: block: use bd_prepare_to_claim directly in the loop driver The arcane magic in bd_start_claiming is only needed to be able to claim a block_device that hasn't been fully set up. Switch the loop driver that claims from the ioctl path with a fully set up struct block_device to just use the much simpler bd_prepare_to_claim directly. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index b7b2ee4b288a..ee80bd81af74 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1027,8 +1027,8 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ -static int bd_prepare_to_claim(struct block_device *bdev, - struct block_device *whole, void *holder) +int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { retry: spin_lock(&bdev_lock); @@ -1055,6 +1055,7 @@ retry: spin_unlock(&bdev_lock); return 0; } +EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) { @@ -1100,7 +1101,8 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) * Pointer to the block device containing @bdev on success, ERR_PTR() * value on failure. */ -struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) { struct gendisk *disk; struct block_device *whole; @@ -1141,7 +1143,6 @@ struct block_device *bd_start_claiming(struct block_device *bdev, void *holder) return whole; } -EXPORT_SYMBOL(bd_start_claiming); static void bd_clear_claiming(struct block_device *whole, void *holder) { -- cgit From 5b642d8b9f2ef1212f33dfc337816e6b4ab83bb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 16:33:10 +0200 Subject: block: integrate bd_start_claiming into __blkdev_get bd_start_claiming duplicates a lot of the work done in __blkdev_get. Integrate the two functions to avoid the duplicate work, and to do the right thing for the md -ERESTARTSYS corner case. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 177 ++++++++++++++++++--------------------------------------- 1 file changed, 55 insertions(+), 122 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index ee80bd81af74..3f94a06a0946 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1078,72 +1078,6 @@ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) return disk; } -/** - * bd_start_claiming - start claiming a block device - * @bdev: block device of interest - * @holder: holder trying to claim @bdev - * - * @bdev is about to be opened exclusively. Check @bdev can be opened - * exclusively and mark that an exclusive open is in progress. Each - * successful call to this function must be matched with a call to - * either bd_finish_claiming() or bd_abort_claiming() (which do not - * fail). - * - * This function is used to gain exclusive access to the block device - * without actually causing other exclusive open attempts to fail. It - * should be used when the open sequence itself requires exclusive - * access but may subsequently fail. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Pointer to the block device containing @bdev on success, ERR_PTR() - * value on failure. - */ -static struct block_device *bd_start_claiming(struct block_device *bdev, - void *holder) -{ - struct gendisk *disk; - struct block_device *whole; - int partno, err; - - might_sleep(); - - /* - * @bdev might not have been initialized properly yet, look up - * and grab the outer block device the hard way. - */ - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - return ERR_PTR(-ENXIO); - - /* - * Normally, @bdev should equal what's returned from bdget_disk() - * if partno is 0; however, some drivers (floppy) use multiple - * bdev's for the same physical device and @bdev may be one of the - * aliases. Keep @bdev if partno is 0. This means claimer - * tracking is broken for those devices but it has always been that - * way. - */ - if (partno) - whole = bdget_disk(disk, 0); - else - whole = bdgrab(bdev); - - put_disk_and_module(disk); - if (!whole) - return ERR_PTR(-ENOMEM); - - err = bd_prepare_to_claim(bdev, whole, holder); - if (err) { - bdput(whole); - return ERR_PTR(err); - } - - return whole; -} - static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); @@ -1156,7 +1090,7 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest - * @whole: whole block device (returned from bd_start_claiming()) + * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. Mark the device as exlusively @@ -1182,7 +1116,7 @@ static void bd_finish_claiming(struct block_device *bdev, /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest - * @whole: whole block device (returned from bd_start_claiming()) + * @whole: whole block device * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be @@ -1505,13 +1439,15 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock_nested(whole->bd_mutex, 1) */ -static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, + int for_part) { + struct block_device *whole = NULL, *claiming = NULL; struct gendisk *disk; int ret; int partno; int perm = 0; - bool first_open = false, need_restart; + bool first_open = false, unblock_events = true, need_restart; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1533,6 +1469,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!disk) goto out; + if (partno) { + whole = bdget_disk(disk, 0); + if (!whole) { + ret = -ENOMEM; + goto out_put_disk; + } + } + + if (!for_part && (mode & FMODE_EXCL)) { + WARN_ON_ONCE(!holder); + if (whole) + claiming = whole; + else + claiming = bdev; + ret = bd_prepare_to_claim(bdev, claiming, holder); + if (ret) + goto out_put_whole; + } + disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { @@ -1576,18 +1531,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; } else { - struct block_device *whole; - whole = bdget_disk(disk, 0); - ret = -ENOMEM; - if (!whole) - goto out_clear; BUG_ON(for_part); - ret = __blkdev_get(whole, mode, 1); - if (ret) { - bdput(whole); + ret = __blkdev_get(whole, mode, NULL, 1); + if (ret) goto out_clear; - } - bdev->bd_contains = whole; + bdev->bd_contains = bdgrab(whole); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1616,11 +1564,30 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_openers++; if (for_part) bdev->bd_part_count++; + if (claiming) + bd_finish_claiming(bdev, claiming, holder); + + /* + * Block event polling for write claims if requested. Any write holder + * makes the write_holder state stick until all are released. This is + * good enough and tracking individual writeable reference is too + * fragile given the way @mode is used in blkdev_get/put(). + */ + if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + unblock_events = false; + } mutex_unlock(&bdev->bd_mutex); - disk_unblock_events(disk); + + if (unblock_events) + disk_unblock_events(disk); + /* only one opener holds refs to the module and disk */ if (!first_open) put_disk_and_module(disk); + if (whole) + bdput(whole); return 0; out_clear: @@ -1631,13 +1598,18 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: + if (claiming) + bd_abort_claiming(bdev, claiming, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); + out_put_whole: + if (whole) + bdput(whole); + out_put_disk: put_disk_and_module(disk); if (need_restart) goto restart; out: - return ret; } @@ -1662,50 +1634,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - struct block_device *whole = NULL; int res; - WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); - - if ((mode & FMODE_EXCL) && holder) { - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - - res = __blkdev_get(bdev, mode, 0); - - if (whole) { - struct gendisk *disk = whole->bd_disk; - - /* finish claiming */ - mutex_lock(&bdev->bd_mutex); - if (!res) - bd_finish_claiming(bdev, whole, holder); - else - bd_abort_claiming(bdev, whole, holder); - /* - * Block event polling for write claims if requested. Any - * write holder makes the write_holder state stick until - * all are released. This is good enough and tracking - * individual writeable reference is too fragile given the - * way @mode is used in blkdev_get/put(). - */ - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; - disk_block_events(disk); - } - - mutex_unlock(&bdev->bd_mutex); - bdput(whole); - } - + res =__blkdev_get(bdev, mode, holder, 0); if (res) bdput(bdev); - return res; } EXPORT_SYMBOL(blkdev_get); -- cgit