Diffstat (limited to 'fs'): 93 files changed, 1375 insertions(+), 1492 deletions(-)
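The first group of hunks below (fs/9p, fs/afs, fs/ceph, fs/gfs2, and the fs/Kconfig entry for CONFIG_MANDATORY_FILE_LOCKING) removes mandatory file locking from the VFS: the per-filesystem __mandatory_lock() checks go away and POSIX locks become purely advisory everywhere. As a reminder of what is being deleted, here is a minimal userspace sketch of how a mandatory lock used to be armed; the mount point and file name are made up, and it assumes a filesystem mounted with "-o mand" on a kernel that still has the option:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical path; the filesystem must be mounted "-o mand". */
		int fd = open("/mnt/demo/locked-file", O_RDWR | O_CREAT, 0640);
		struct flock fl = {
			.l_type   = F_WRLCK,	/* exclusive lock */
			.l_whence = SEEK_SET,
			.l_start  = 0,
			.l_len    = 0,		/* 0 = whole file */
		};

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* The mandatory-locking marker: setgid set, group-execute clear. */
		if (fchmod(fd, S_ISGID | 0640) < 0)
			perror("fchmod");

		if (fcntl(fd, F_SETLKW, &fl) < 0)
			perror("fcntl");

		/*
		 * On an old kernel with CONFIG_MANDATORY_FILE_LOCKING=y, other
		 * processes now block in read()/write() on this file (or get
		 * -EAGAIN with O_NONBLOCK).  After this series the same lock
		 * is advisory: only cooperating fcntl() users observe it.
		 */
		pause();
		return 0;
	}

With the checks gone, v9fs_file_lock() and friends no longer need their mandatory-lock early exits, which is all the first few hunks do.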
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 59c32c9b799f..c4a2dc41beac 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -121,10 +121,6 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -312,10 +308,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -327,7 +319,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) ret = v9fs_file_getlock(filp, fl); else ret = -EINVAL; -out_err: return ret; } @@ -348,10 +339,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if (!(fl->fl_flags & FL_FLOCK)) goto out_err; diff --git a/fs/Kconfig b/fs/Kconfig index a7749c126b8e..949128bf86c9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -101,16 +101,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -config MANDATORY_FILE_LOCKING - bool "Enable Mandatory file locking" - depends on FILE_LOCKING - default y - help - This option enables files appropriately marked files on appropriely - mounted filesystems to support mandatory locking. - - To the best of my knowledge this is dead code that no one cares about. 
- source "fs/crypto/Kconfig" source "fs/verity/Kconfig" diff --git a/fs/afs/flock.c b/fs/afs/flock.c index cb3054c7843e..c4210a3964d8 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -772,10 +772,6 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) fl->fl_type, fl->fl_flags, (long long) fl->fl_start, (long long) fl->fl_end); - /* AFS doesn't support mandatory locks */ - if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (IS_GETLK(cmd)) return afs_do_getlk(file, fl); @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - if (iocb->ki_eventfd && eventfd_signal_count()) { + if (iocb->ki_eventfd && eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); diff --git a/fs/block_dev.c b/fs/block_dev.c index 3c7fb7106713..45df6cbccf12 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include <linux/uaccess.h> #include <linux/suspend.h> #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -688,7 +689,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -709,7 +711,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device @@ -803,7 +804,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -814,8 +814,15 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -828,16 +835,9 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? 
*/ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -904,9 +904,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -923,31 +920,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. - */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; @@ -961,12 +933,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1096,148 +1062,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This functions creates the following sysfs symlinks. - * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the calimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -1262,11 +1086,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1284,16 +1105,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1301,16 +1120,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); done: part->bd_openers++; return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1323,42 +1138,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) 
blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** @@ -1411,7 +1226,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06f9f167222b..bd5689fa290e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -629,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. 
*/ - if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a1e2813731d1..7e7a897ae0d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; - struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); + struct page *page; + + filemap_invalidate_lock_shared(mapping); + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; @@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: + filemap_invalidate_unlock_shared(mapping); dout("filemap_fault %p %llu read inline data ret %x\n", inode, off, ret); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2a2900903f8c..39db97f149b9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1743,7 +1743,11 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush *ceph_alloc_cap_flush(void) { - return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + struct ceph_cap_flush *cf; + + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + cf->is_capsnap = false; + return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) @@ -1778,7 +1782,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, prev->wake = true; wake = false; } - list_del(&cf->g_list); + list_del_init(&cf->g_list); return wake; } @@ -1793,7 +1797,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, prev->wake = true; wake = false; } - list_del(&cf->i_list); + list_del_init(&cf->i_list); return wake; } @@ -2352,7 +2356,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { - if (!cf->caps) { + if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } @@ -2371,7 +2375,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, first_tid = cf->tid + 1; - if (cf->caps) { + if (!cf->is_capsnap) { struct cap_msg_args arg; dout("kick_flushing_caps %p cap %p tid %llu %s\n", @@ -3516,7 +3520,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, cleaned = cf->caps; /* Is this a capsnap? 
*/ - if (cf->caps == 0) + if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { @@ -3589,8 +3593,9 @@ out: while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } if (wake_ci) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d1755ac1d964..e1d605a02d4a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; + filemap_invalidate_lock(inode->i_mapping); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode, if (dirty) __mark_inode_dirty(inode, dirty); } + filemap_invalidate_unlock(inode->i_mapping); ceph_put_cap_refs(ci, got); unlock: diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fa8a847743d0..bdeb271f47d9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -240,9 +240,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - /* No mandatory locks */ - if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) - return -ENOLCK; dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index afdc20213876..0b69aec23e5c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1616,7 +1616,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) - list_del(&cf->g_list); + list_del_init(&cf->g_list); if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1668,8 +1668,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } wake_up_all(&ci->i_cap_wq); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index abd9af7727ad..3c444b9cb17b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -394,9 +394,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->possible_max_rank; i++) - kfree(m->m_info[i].export_targets); - kfree(m->m_info); + if (m->m_info) { + for (i = 0; i < m->possible_max_rank; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + } kfree(m->m_data_pg_pools); kfree(m); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4c6bd1042c94..15105f9da3fd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -487,6 +487,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); return; } + capsnap->cap_flush.is_capsnap = true; + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 9215a2f4535c..b1a363641beb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -182,8 +182,9 @@ struct ceph_cap { struct ceph_cap_flush { u64 tid; - int caps; /* 0 means capsnap */ + int caps; bool wake; /* wake up flush waiters when finish ? 
*/ + bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2dfd0d8297eb..ddc0e8f97872 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } + filemap_invalidate_lock(inode->i_mapping); /* * We implement the punch hole through ioctl, so we need remove the page * caches first, otherwise the data may be inconsistent with the server. @@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); free_xid(xid); + filemap_invalidate_unlock(inode->i_mapping); return rc; } diff --git a/fs/eventfd.c b/fs/eventfd.c index e265b6dd4f34..3627dd7d25db 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -25,8 +25,6 @@ #include <linux/idr.h> #include <linux/uio.h> -DEFINE_PER_CPU(int, eventfd_wake_count); - static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should - * check eventfd_signal_count() before calling this function. If - * it returns true, the eventfd_signal() call should be deferred to a + * check eventfd_signal_allowed() before calling this function. If + * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + if (WARN_ON_ONCE(current->in_eventfd_signal)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - this_cpu_inc(eventfd_wake_count); + current->in_eventfd_signal = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - this_cpu_dec(eventfd_wake_count); + current->in_eventfd_signal = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 54eec9185627..1248ff4ef562 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP help Ext2 is a standard Linux file system for hard disks. 
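From here on a second theme dominates: ext2 (below), and later ext4, f2fs, fuse, and the ceph/cifs fallocate paths, drop their private page-fault-vs-truncate semaphores (dax_sem, i_mmap_sem) in favor of the generic mapping->invalidate_lock. The contract, reconstructed from the ext2 hunks that follow, is sketched here; the example_* names are placeholders rather than functions from the patch, and the snippet is schematic kernel code, not something buildable on its own:

	/* Invalidating path: take invalidate_lock exclusively so no new
	 * pages can be faulted in while blocks are being freed. */
	static void example_truncate_blocks(struct inode *inode, loff_t offset)
	{
		filemap_invalidate_lock(inode->i_mapping);
		/* ... truncate the page cache and free the underlying blocks ... */
		filemap_invalidate_unlock(inode->i_mapping);
	}

	/* DAX fault path: take it shared, so faults can run concurrently
	 * with each other but never concurrently with an invalidation. */
	static vm_fault_t example_dax_fault(struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vmf->vma->vm_file);
		vm_fault_t ret;

		filemap_invalidate_lock_shared(inode->i_mapping);
		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL,
				      &ext2_iomap_ops);
		filemap_invalidate_unlock_shared(inode->i_mapping);
		return ret;
	}

Plain page-cache faults need no explicit locking any more: ext4_filemap_fault and the f2fs wrapper around filemap_fault are deleted outright later in the diff, because generic filemap_fault now takes invalidate_lock itself when it has to instantiate pages.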
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e512630cb63e..3be9dd6412b7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -667,9 +667,6 @@ struct ext2_inode_info { struct rw_semaphore xattr_sem; #endif rwlock_t i_meta_lock; -#ifdef CONFIG_FS_DAX - struct rw_semaphore dax_sem; -#endif /* * truncate_mutex is for serialising ext2_truncate() against @@ -685,14 +682,6 @@ struct ext2_inode_info { #endif }; -#ifdef CONFIG_FS_DAX -#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem) -#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem) -#else -#define dax_sem_down_write(ext2_inode) -#define dax_sem_up_write(ext2_inode) -#endif - /* * Inode dynamic state flags */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index f98466acc672..eb97aa3d700e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,7 +81,7 @@ out_unlock: * * mmap_lock (MM) * sb_start_pagefault (vfs, freeze) - * ext2_inode_info->dax_sem + * address_space->invalidate_lock * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * @@ -91,7 +91,6 @@ out_unlock: static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); @@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } - down_read(&ei->dax_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); - up_read(&ei->dax_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dadb121beb22..333fa62661d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,7 +799,6 @@ int ext2_get_block(struct inode *inode, sector_t iblock, } -#ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { @@ -852,16 +851,18 @@ const struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; -#else -/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ -const struct iomap_ops ext2_iomap_ops; -#endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, - ext2_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops); + inode_unlock(inode); + + return ret; } static int ext2_writepage(struct page *page, struct writeback_control *wbc) @@ -1177,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -/* dax_sem must be held when calling this function */ +/* mapping->invalidate_lock must be held when calling this function */ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; @@ -1194,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); #ifdef CONFIG_FS_DAX - WARN_ON(!rwsem_is_locked(&ei->dax_sem)); + WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)); 
#endif n = ext2_block_to_path(inode, iblock, offsets, NULL); @@ -1276,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset) if (ext2_inode_is_fast_symlink(inode)) return; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); __ext2_truncate_blocks(inode, offset); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); } static int ext2_setsize(struct inode *inode, loff_t newsize) @@ -1308,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) if (error) return error; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, newsize); __ext2_truncate_blocks(inode, newsize); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); inode->i_mtime = inode->i_ctime = current_time(inode); if (inode_needs_sync(inode)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 21e09fbaa46f..987bcf32ed46 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,9 +206,6 @@ static void init_once(void *foo) init_rwsem(&ei->xattr_sem); #endif mutex_init(&ei->truncate_mutex); -#ifdef CONFIG_FS_DAX - init_rwsem(&ei->dax_sem); -#endif inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..7ebaf66b6e31 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1086,15 +1086,6 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; - /* - * i_mmap_sem is for serializing page faults with truncate / punch hole - * operations. We have to make sure that new page cannot be faulted in - * a section of the inode that is being punched. We cannot easily use - * i_data_sem for this since we need protection for the whole punch - * operation and i_data_sem ranks below transaction start so we have - * to occasionally drop it. - */ - struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92ad64b89d9b..c33e0a2cb6c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; @@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Prevent page faults from reinstantiating pages we have * released from page cache. 
*/ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ @@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } @@ -5221,6 +5222,7 @@ out: static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; @@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Write tail of the last page before removed range since it will get * removed from the page cache below. */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty - * by i_mmap_sem. + * by i_rwsem and invalidate_lock. */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; @@ -5350,7 +5352,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5367,6 +5369,7 @@ out_mutex: static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; @@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. 
*/ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5526,7 +5529,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 816dedcbd541..d3b4ed91aa68 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); if (write) { @@ -731,10 +732,10 @@ retry: /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); } return result; @@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = { #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = ext4_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..325c038e7b23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei) +static void ext4_wait_dax_page(struct inode *inode) { - up_write(&ei->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&ei->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { - struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) + if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { @@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei)); + ext4_wait_dax_page(inode)); } while (error == 0); return error; @@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); if (ext4_has_inline_data(inode)) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_convert_inline_data(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; } @@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Prevent page faults from reinstantiating pages we have 
released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) out_stop: ext4_journal_stop(handle); out_dio: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_dio_wait(inode); } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); goto err_out; } @@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = rc; } out_mmap_sem: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); } if (!error) { @@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * data (and journalled aops don't know how to handle these cases). */ if (val) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return err; } } @@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) percpu_up_write(&sbi->s_writepages_rwsem); if (val) - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ @@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) @@ -6176,7 +6175,7 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(err); out: - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: @@ -6184,15 +6183,3 @@ out_error: ext4_journal_stop(handle); goto out; } - -vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - down_read(&EXT4_I(inode)->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - - return ret; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..4fb5fe083c2b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb, goto journal_err_out; } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; @@ -256,7 +256,7 @@ err_out1: ext4_double_up_write_data_sem(inode, inode_bl); err_out: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..d6df62fc810c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, /* * Lock ordering * - * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and - * i_mmap_rwsem (inode->i_mmap_rwsem)! 
- * * page fault path: - * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> - * page lock -> i_data_sem (rw) + * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start + * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock @@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> + * page lock + * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: @@ -1360,7 +1358,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); - init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index bcbe3668c1d4..ce84aa2786c7 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -11,14 +11,16 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + struct address_space *mapping = inode->i_mapping; + /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ - down_write(&EXT4_I(inode)->i_mmap_sem); - truncate_inode_pages(inode->i_mapping, inode->i_size); + filemap_invalidate_lock(mapping); + truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2cf48c5a2e4..eb222b35edef 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, int ret = 0; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); @@ -3894,7 +3894,7 @@ done: clear_inode_flag(inode, FI_DO_DEFRAG); clear_inode_flag(inode, FI_ALIGNED_WRITE); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee8eb33e2c25..906b2c4b50e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -754,7 +754,6 @@ struct f2fs_inode_info { /* avoid racing between foreground op and gc */ struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6afd4562335f..1ff333755721 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -38,10 +38,7 @@ static vm_fault_t 
f2fs_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); - if (!ret) f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, F2FS_BLKSIZE); @@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); err: @@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end = (loff_t)pg_end << PAGE_SHIFT; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; ret = f2fs_truncate_blocks(inode, new_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (!ret) f2fs_i_size_write(inode, new_size); return ret; @@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - 
down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1543,6 +1540,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; /* write out all dirty pages from offset */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); - filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (!ret) f2fs_i_size_write(inode, new_size); @@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file 
*filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto err; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, to_end ? LLONG_MAX : end_addr - 1); @@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); out: - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); @@ -4313,9 +4311,9 @@ write: /* if we couldn't write data, we should deallocate blocks. */ if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fecd3050ccd..ce2ab1b85c11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->i_gc_rwsem[READ]); init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include <linux/blkdev.h> #include <linux/sched/signal.h> +#include <linux/backing-dev-defs.h> #include "fat.h" struct fatent_operations { diff --git a/fs/fcntl.c b/fs/fcntl.c index f946bec8f1f1..68added37c15 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -150,7 +150,8 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; - read_lock(&filp->f_owner.lock); + + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { pid = pid_vnr(filp->f_owner.pid); @@ -158,7 +159,7 @@ pid_t f_getown(struct file *filp) pid = -pid; } rcu_read_unlock(); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); return pid; } @@ -208,7 +209,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex owner = {}; int ret = 0; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) owner.pid = pid_vnr(filp->f_owner.pid); @@ -231,7 +232,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -249,10 +250,10 @@ static int 
f_getowner_uids(struct file *filp, unsigned long arg) uid_t src[2]; int err; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); src[0] = from_kuid(user_ns, filp->f_owner.uid); src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -1003,13 +1004,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - read_lock(&fa->fa_lock); + read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1018,7 +1020,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - read_unlock(&fa->fa_lock); + read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9d58371d22c2..281d79f8b3d3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, /* * Can't do inline reclaim in fault path. We call * dax_layout_busy_page() before we free a range. And - * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. - * In fault path we enter with fi->i_mmap_sem held and can't drop - * it. Also in fault path we hold fi->i_mmap_sem shared and not - * exclusive, so that creates further issues with fuse_wait_dax_page(). - * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory - * range to become free and retry. + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. */ if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fcd); @@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, idx, idx); - /* We are holding either inode lock or i_mmap_sem, and that should + /* We are holding either inode lock or invalidate_lock, and that should * ensure that dmap can't be truncated. We are holding a reference * on dmap and that should make sure it can't be reclaimed. So dmap * should still be there in tree despite the fact we dropped and @@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = { static void fuse_wait_dax_page(struct inode *inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with fi->i_mmap_sem lock held exclusively */ +/* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { @@ -813,18 +811,18 @@ retry: * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. 
*/ - down_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); @@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, int ret; struct interval_tree_node *node; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); @@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return dmap; } @@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise - * if a deadlock is possible if we sleep with fi->i_mmap_sem - * held and worker to free memory can't make progress due - * to unavailability of fi->i_mmap_sem lock. So sleep - * only if fi->dax->nr=0 + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; @@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding - * fi->i_mmap_sem, worker should still be able to free up - * a range and wake us up. + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, @@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, /* * Free a range of memory. * Locking: - * 1. Take fi->i_mmap_sem to block dax faults. + * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ @@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. 
err=%d\n", @@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index eade6f965b2e..d9b977c0f38d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (FUSE_IS_DAX(inode) && is_truncate) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } } @@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return 0; @@ -1711,7 +1712,7 @@ error: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97f860cfc195..621a662c19fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } if (dax_truncate) { - down_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) out: if (dax_truncate) - up_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); @@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (lock_inode) { inode_lock(inode); if (block_faults) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -2976,7 +2976,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (lock_inode) inode_unlock(inode); @@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * - * To fix this a i_mmap_sem style lock could be used to prevent new + * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. 
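The comment above describes a race visible through copy_file_range(2). A hedged userspace sketch of that call, only to make the window concrete (paths are placeholders; glibc >= 2.27 assumed):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int in, out;
            ssize_t n;

            if (argc < 3)
                    return 1;
            in = open(argv[1], O_RDONLY);
            out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (in < 0 || out < 0)
                    return 1;
            /* Stores done through a concurrent mmap of argv[1] while this
             * call runs may or may not be reflected in the copy -- the
             * window the comment above proposes closing with
             * mapping->invalidate_lock. */
            n = copy_file_range(in, NULL, out, NULL, 1 << 20, 0);
            printf("copied %zd bytes\n", n);
            close(in);
            close(out);
            return 0;
    }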
*/ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07829ce78695..6fb639b97ea8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -149,13 +149,6 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; - /** - * Can't take inode lock in fault path (leads to circular dependency). - * Introduce another semaphore which can be taken in fault path and - * then other filesystem paths can take this to block faults. - */ - struct rw_semaphore i_mmap_sem; - #ifdef CONFIG_FUSE_DAX /* * Dax specific inode data diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b9beb39a4a18..e07e429f32e1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); - init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 84ec053d43b4..c559827cb6f9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1237,9 +1237,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (cmd == F_CANCELLK) { /* Hack: */ cmd = F_SETLK; diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 2b36dc6f0a10..ec975f466877 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS is the file system used for organizing files on OS/2 hard disk diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index c3a49aacf20a..fb37f57130aa 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/mpage.h> +#include <linux/iomap.h> #include <linux/fiemap.h> #define BLOCKS(size) (((size) + 511) >> 9) @@ -116,6 +117,47 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he return r; } +static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct super_block *sb = inode->i_sb; + unsigned int blkbits = inode->i_blkbits; + unsigned int n_secs; + secno s; + + if (WARN_ON_ONCE(flags & (IOMAP_WRITE | IOMAP_ZERO))) + return -EINVAL; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = offset; + + hpfs_lock(sb); + s = hpfs_bmap(inode, offset >> blkbits, &n_secs); + if (s) { + n_secs = hpfs_search_hotfix_map_for_range(sb, s, + min_t(loff_t, n_secs, length)); + if (unlikely(!n_secs)) { + s = hpfs_search_hotfix_map(sb, s); + n_secs = 1; + } + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_MERGED; + iomap->addr = (u64)s << blkbits; + iomap->length = (u64)n_secs << blkbits; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = 1 << blkbits; + } + + hpfs_unlock(sb); + return 0; +} + +static const struct iomap_ops hpfs_iomap_ops = { + .iomap_begin = hpfs_iomap_begin, +}; + static int hpfs_readpage(struct file *file, struct page *page) { return mpage_readpage(page, hpfs_get_block); @@ -192,7 +234,14 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, 
len, hpfs_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &hpfs_iomap_ops); + inode_unlock(inode); + + return ret; } const struct address_space_operations hpfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..84c528cd1955 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; + __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", + &sb->s_type->invalidate_lock_key); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/io-wq.c b/fs/io-wq.c index 4ce83bb48021..cd9bd095fb1b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -178,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker) complete(&worker->ref_done); wait_for_completion(&worker->ref_done); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -188,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -254,18 +254,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (!ret) { bool do_create = false, first = false; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); - if (do_create) + raw_spin_unlock(&wqe->lock); + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); create_io_worker(wqe->wq, wqe, acct->index, first); + } } } @@ -288,14 +289,14 @@ static void create_worker_cb(struct callback_head *cb) wqe = worker->wqe; wq = wqe->wq; acct = &wqe->acct[worker->create_index]; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { create_io_worker(wq, wqe, worker->create_index, first); } else { @@ -423,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) spin_unlock(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * We can always run the work if the worker is currently the same type as + * the work (eg both are bound, or both are unbound). If they are not the + * same, only allow it if incrementing the worker count would be allowed. 
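The io_worker_can_run_work() helper introduced just below relies on '!' collapsing a masked flag word to exactly 0 or 1 before comparison. A standalone sketch of the idiom, with hypothetical flag values:

    #include <stdbool.h>
    #include <stdio.h>

    #define WORKER_BOUND    0x01    /* worker services bounded work */
    #define WORK_UNBOUND    0x04    /* request is unbounded */

    /* A bound worker gives !BOUND == 0 and bound work gives
     * !UNBOUND == 1, so "0 != 1" is true exactly when types match. */
    static bool same_type(unsigned int worker_flags, unsigned int work_flags)
    {
            return !(worker_flags & WORKER_BOUND) !=
                   !(work_flags & WORK_UNBOUND);
    }

    int main(void)
    {
            printf("%d\n", same_type(WORKER_BOUND, 0));            /* 1 */
            printf("%d\n", same_type(0, WORK_UNBOUND));            /* 1 */
            printf("%d\n", same_type(WORKER_BOUND, WORK_UNBOUND)); /* 0 */
            return 0;
    }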
+ */ +static bool io_worker_can_run_work(struct io_worker *worker, + struct io_wq_work *work) +{ + struct io_wqe_acct *acct; + + if (!(worker->flags & IO_WORKER_F_BOUND) != + !(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + + /* not the same type, check if we'd go over the limit */ + acct = io_work_get_acct(worker->wqe, work); + return acct->nr_workers < acct->max_workers; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_worker *worker, + bool *stalled) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; @@ -435,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) work = container_of(node, struct io_wq_work, list); + if (!io_worker_can_run_work(worker, work)) + break; + /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { wq_list_del(&wqe->work_list, node, prev); @@ -461,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); + *stalled = true; } return NULL; @@ -484,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock_irq(&worker->lock); + spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock_irq(&worker->lock); + spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -500,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; + bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -508,13 +535,14 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(wqe); + stalled = false; + work = io_get_next_work(wqe, worker, &stalled); if (work) __io_worker_busy(wqe, worker, work); - else if (!wq_list_empty(&wqe->work_list)) + else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -546,16 +574,16 @@ get_next: clear_bit(hash, &wq->hash->map); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); } } while (work); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); } while (1); } @@ -576,13 +604,13 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (io_wqe_run_queue(wqe)) { io_worker_handle_work(worker); goto loop; } __io_worker_idle(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -601,7 +629,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); io_worker_handle_work(worker); } @@ -643,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&worker->wqe->lock); + raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&worker->wqe->lock); + raw_spin_unlock(&worker->wqe->lock); } static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) @@ -671,9 +699,9 @@ static void 
create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo kfree(worker); fail: atomic_dec(&acct->nr_running); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); acct->nr_workers--; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); return; } @@ -683,7 +711,7 @@ fail: set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -691,7 +719,7 @@ fail: worker->flags |= IO_WORKER_F_BOUND; if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); wake_up_new_task(tsk); } @@ -766,8 +794,7 @@ append: static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - int work_flags; - unsigned long flags; + bool do_wake; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -779,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - work_flags = work->flags; - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - raw_spin_unlock_irqrestore(&wqe->lock, flags); + do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running); + raw_spin_unlock(&wqe->lock); - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) + if (do_wake) io_wqe_wake_worker(wqe, acct); } @@ -812,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; - unsigned long flags; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. */ - spin_lock_irqsave(&worker->lock, flags); + spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock_irqrestore(&worker->lock, flags); + spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -852,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { struct io_wq_work_node *node, *prev; struct io_wq_work *work; - unsigned long flags; retry: - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -870,7 +895,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1151,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. 
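io_wq_max_workers(), introduced just below, is the kernel backend for a worker-limit registration knob. A hedged userspace sketch via liburing's io_uring_register_iowq_max_workers() wrapper; the wrapper's availability depends on your liburing and kernel versions:

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
            struct io_uring ring;
            /* index 0: bounded workers, index 1: unbounded workers.
             * A zero entry leaves that limit unchanged; on return each
             * slot reports the previous value. */
            unsigned int counts[2] = { 4, 0 };

            if (io_uring_queue_init(8, &ring, 0) < 0)
                    return 1;
            if (io_uring_register_iowq_max_workers(&ring, counts) == 0)
                    printf("old bounded=%u old unbounded=%u\n",
                           counts[0], counts[1]);
            io_uring_queue_exit(&ring);
            return 0;
    }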
+ */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 308af3928424..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 504aede8ca47..73928d957691 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,12 +92,12 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* 512 entries per page on 64-bit archs, 64 pages max */ +/* only define max */ #define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -375,6 +375,7 @@ struct io_ring_ctx { struct io_submit_state submit_state; struct list_head timeout_list; + struct list_head ltimeout_list; struct list_head cq_overflow_list; struct xarray io_buffers; struct xarray personalities; @@ -508,6 +509,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -515,6 +517,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + u32 file_slot; unsigned long nofile; }; @@ -549,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -580,6 +584,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -705,12 +710,12 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_NOWAIT_READ_BIT, REQ_F_NOWAIT_WRITE_BIT, @@ -750,8 +755,6 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* linked timeout is active, i.e. 
prepared by link's head */ - REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -768,6 +771,8 @@ enum { REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), /* skip refcounting if not set */ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), }; struct async_poll { @@ -775,7 +780,7 @@ struct async_poll { struct io_poll_iocb *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { @@ -1034,6 +1039,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) + static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, @@ -1060,6 +1068,10 @@ static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1077,6 +1089,14 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -1115,14 +1135,19 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static inline void io_req_refcount(struct io_kiocb *req) +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, 1); + atomic_set(&req->refs, nr); } } +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1167,6 +1192,12 @@ static inline void req_set_fail(struct io_kiocb *req) req->flags |= REQ_F_FAIL; } +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + req->result = res; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1185,11 +1216,19 @@ static void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; + bool locked = false; percpu_ref_get(&ctx->refs); llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); + + if (locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } percpu_ref_put(&ctx->refs); + } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -1241,6 +1280,7 @@ static struct 
io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); @@ -1298,27 +1338,28 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +static inline void io_unprep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt = req->link; + req->flags &= ~REQ_F_LINK_TIMEOUT; +} - if (req->flags & REQ_F_LINK_TIMEOUT) +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!req->link)) return NULL; - /* linked timeouts should have two refs once prep'ed */ - io_req_refcount(req); - io_req_refcount(nxt); - req_ref_get(nxt); - - nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; + req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; + + /* linked timeouts should have two refs once prep'ed */ + io_req_set_refcount(req); + __io_req_set_refcount(req->link, 2); + return req->link; } static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT)) + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) return NULL; return __io_prep_linked_timeout(req); } @@ -1372,12 +1413,15 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + /* must not take the lock, NULL it as a precaution */ + locked = NULL; + BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1517,6 +1561,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return !ctx->eventfd_async || io_wq_current_is_worker(); } +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no dependency on a + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring.
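To see the behaviour the comment above guards, here is a hedged userspace sketch: an eventfd registered against a ring wakes readers when CQEs arrive, but the counter read is not a per-CQE tally (io_uring_register_eventfd() is liburing's wrapper):

    #include <liburing.h>
    #include <stdint.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
            struct io_uring ring;
            struct io_uring_sqe *sqe;
            uint64_t n;
            int efd;

            efd = eventfd(0, 0);
            if (efd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
                    return 1;
            io_uring_register_eventfd(&ring, efd);

            sqe = io_uring_get_sqe(&ring);
            io_uring_prep_nop(sqe);
            io_uring_submit(&ring);

            /* Blocks until at least one CQE has been posted; n may
             * cover several CQEs if completions batched up. */
            read(efd, &n, sizeof(n));

            io_uring_queue_exit(&ring);
            close(efd);
            return 0;
    }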
+ */ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { /* @@ -1614,10 +1665,32 @@ static inline void io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); + if (likely(task == current)) { + tctx->cached_refs += nr; + } else { + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); + } +} + +static void io_task_refs_refill(struct io_uring_task *tctx) +{ + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, &current->usage); + tctx->cached_refs += refill; +} + +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, @@ -1690,7 +1763,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, */ if (req_ref_put_and_test(req)) { if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) + if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1891,16 +1964,13 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) { struct io_kiocb *link = req->link; - /* - * Can happen if a linked timeout fired and link had been like - * req -> link t-out -> link t-out [-> ...] - */ - if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&link->timeout.list); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); io_put_req_deferred(link); @@ -1917,11 +1987,16 @@ static void io_fail_links(struct io_kiocb *req) req->link = NULL; while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->result; + nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); + io_cqring_fill_event(link->ctx, link->user_data, res, 0); io_put_req_deferred(link); link = nxt; } @@ -1932,7 +2007,18 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) { + if (req->flags & REQ_F_ARM_LTIMEOUT) { + struct io_kiocb *link = req->link; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); + io_put_req_deferred(link); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -1957,7 +2043,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain.
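The chain-failure rule described above is observable from userspace: once an SQE in an IOSQE_IO_LINK chain fails, the remainder completes with -ECANCELED. A hedged liburing sketch (the bad fd is just a convenient way to force a failure):

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
            struct io_uring ring;
            struct io_uring_sqe *sqe;
            struct io_uring_cqe *cqe;
            char buf[16];
            int i;

            if (io_uring_queue_init(8, &ring, 0) < 0)
                    return 1;

            sqe = io_uring_get_sqe(&ring);
            io_uring_prep_read(sqe, -1, buf, sizeof(buf), 0); /* fails */
            sqe->flags |= IOSQE_IO_LINK;  /* next SQE depends on this */

            sqe = io_uring_get_sqe(&ring);
            io_uring_prep_nop(sqe);  /* expected res: -ECANCELED */

            io_uring_submit(&ring);
            for (i = 0; i < 2; i++) {
                    io_uring_wait_cqe(&ring, &cqe);
                    printf("res=%d\n", cqe->res);
                    io_uring_cqe_seen(&ring, cqe);
            }
            io_uring_queue_exit(&ring);
            return 0;
    }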
*/ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { + if (req->flags & IO_DISARM_MASK) { struct io_ring_ctx *ctx = req->ctx; bool posted; @@ -1981,20 +2067,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static void ctx_flush_and_put(struct io_ring_ctx *ctx) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; - if (ctx->submit_state.compl_nr) { - mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(ctx); + if (*locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); + *locked = false; } percpu_ref_put(&ctx->refs); } static void tctx_task_work(struct callback_head *cb) { + bool locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -2017,18 +2105,20 @@ static void tctx_task_work(struct callback_head *cb) io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); ctx = req->ctx; + /* if not contended, grab and improve batching */ + locked = mutex_trylock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); node = next; } while (node); cond_resched(); } - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); } static void io_req_task_work_add(struct io_kiocb *req) @@ -2080,27 +2170,25 @@ static void io_req_task_work_add(struct io_kiocb *req) } } -static void io_req_task_cancel(struct io_kiocb *req) +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx is guaranteed to stay alive while we hold uring_lock */ - mutex_lock(&ctx->uring_lock); + /* not needed for normal modes, but SQPOLL depends on it */ + io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); - mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ - mutex_lock(&ctx->uring_lock); + io_tw_lock(ctx, locked); + /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); - mutex_unlock(&ctx->uring_lock); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -2136,6 +2224,11 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +static void io_free_req_work(struct io_kiocb *req, bool *locked) +{ + io_free_req(req); +} + struct req_batch { struct task_struct *task; int task_refs; @@ -2154,9 +2247,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task == current) - current->io_uring->cached_refs += rb->task_refs; - else if (rb->task) + if (rb->task) io_put_task(rb->task, rb->task_refs); } @@ -2182,7 +2273,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, } static void io_submit_flush_completions(struct io_ring_ctx *ctx) - __must_hold(&req->ctx->uring_lock) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; int i, nr = state->compl_nr; @@ -2235,7 +2326,7 @@ static inline void io_put_req(struct io_kiocb *req) static inline void io_put_req_deferred(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { - 
req->io_task_work.func = io_free_req; + req->io_task_work.func = io_free_req_work; io_req_task_work_add(req); } } @@ -2270,6 +2361,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; return io_put_kbuf(req, kbuf); } @@ -2289,7 +2382,7 @@ static inline bool io_run_task_work(void) * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done, bool resubmit) + struct list_head *done) { struct req_batch rb; struct io_kiocb *req; @@ -2299,22 +2392,18 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_init_req_batch(&rb); while (!list_empty(done)) { - int cflags = 0; - req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN && resubmit && + if (READ_ONCE(req->result) == -EAGAIN && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; io_req_task_queue_reissue(req); continue; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - - __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -2327,7 +2416,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min, bool resubmit) + long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); @@ -2367,7 +2456,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done, resubmit); + io_iopoll_complete(ctx, nr_events, &done); return 0; } @@ -2385,7 +2474,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 0, false); + io_do_iopoll(ctx, &nr_events, 0); /* let it sleep and repeat later if can't complete a request */ if (nr_events == 0) @@ -2447,7 +2536,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min, true); + ret = io_do_iopoll(ctx, &nr_events, min); } while (!ret && nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); @@ -2532,13 +2621,22 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void io_req_task_complete(struct io_kiocb *req) +static void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int cflags = 0; + unsigned int cflags = io_put_rw_kbuf(req); + long res = req->result; - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, 0, req->result, cflags); + if (*locked) { + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + + io_req_complete_state(req, res, cflags); + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) + io_submit_flush_completions(ctx); + } else { + io_req_complete_post(req, res, cflags); + } } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, @@ -2546,7 +2644,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { if 
(__io_complete_rw_common(req, res)) return; - io_req_task_complete(req); + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -2806,12 +2904,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { - int cflags = 0; - req_set_fail(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, + io_put_rw_kbuf(req)); } } } @@ -3493,7 +3588,7 @@ static int io_renameat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3544,7 +3639,8 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3590,8 +3686,8 @@ static int io_shutdown_prep(struct io_kiocb *req, #if defined(CONFIG_NET) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->shutdown.how = READ_ONCE(sqe->len); @@ -3739,7 +3835,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -3772,7 +3869,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3822,6 +3920,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3859,8 +3962,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3879,9 +3982,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ 
-3890,7 +3995,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3903,7 +4009,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3924,7 +4035,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -3995,7 +4107,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4082,7 +4194,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4128,7 +4240,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4163,7 +4275,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr) + if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4201,7 +4313,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4237,7 +4349,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4298,7 +4410,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return 
-EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4732,6 +4845,15 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || + (accept->flags & SOCK_CLOEXEC))) + return -EINVAL; + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return 0; } @@ -4740,20 +4862,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - int ret; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags, - accept->nofile); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret < 0) { + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + accept->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4773,7 +4910,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4886,6 +5024,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) WRITE_ONCE(poll->canceled, true); @@ -4964,7 +5103,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct io_kiocb *req) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; @@ -4988,7 +5127,7 @@ static void io_poll_task_func(struct io_kiocb *req) if (done) { nxt = io_put_req_find_next(req); if (nxt) - io_req_task_submit(nxt); + io_req_task_submit(nxt, locked); } } } @@ -5055,8 +5194,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, if (unlikely(pt->nr_entries)) { struct io_poll_iocb *poll_one = poll; + /* double add on the same waitqueue head, ignore */ + if (poll_one->head == head) + return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; pt->error = -EINVAL; return; } @@ -5066,9 +5210,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, 
struct io_poll_table *pt, */ if (!(poll_one->events & EPOLLONESHOT)) poll_one->events |= EPOLLONESHOT; - /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) - return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; @@ -5098,7 +5239,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct io_kiocb *req) +static void io_async_task_func(struct io_kiocb *req, bool *locked) { struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5115,7 +5256,7 @@ static void io_async_task_func(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req); + io_req_task_submit(req, locked); else io_req_complete_failed(req, -ECANCELED); } @@ -5233,17 +5374,14 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_refcount(req); + io_req_set_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret || ipt.error) { - spin_unlock(&ctx->completion_lock); - if (ret) - return IO_APOLL_READY; - return IO_APOLL_ABORTED; - } spin_unlock(&ctx->completion_lock); + if (ret || ipt.error) + return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; @@ -5369,7 +5507,7 @@ static int io_poll_update_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -5424,7 +5562,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; - io_req_refcount(req); + io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -5517,18 +5655,10 @@ err: return 0; } -static void io_req_task_timeout(struct io_kiocb *req) +static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, -ETIME, 0); } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5574,6 +5704,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5587,6 +5718,47 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + 
struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5600,7 +5772,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5615,13 +5787,18 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5648,22 +5825,26 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->timeout_lock); - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); - spin_unlock_irq(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5676,14 +5857,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || + 
sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); req->timeout.off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -5693,14 +5879,24 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - if (is_timeout_link) - io_req_track_inflight(req); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + req->timeout.head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } return 0; } @@ -5793,32 +5989,27 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static void io_async_find_and_cancel(struct io_ring_ctx *ctx, - struct io_kiocb *req, __u64 sqe_addr, - int success_ret) +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) { + struct io_ring_ctx *ctx = req->ctx; int ret; + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); if (ret != -ENOENT) - goto done; + return ret; + + spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - goto done; + goto out; ret = io_poll_cancel(ctx, sqe_addr, false); -done: - if (!ret) - ret = success_ret; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); +out: spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail(req); + return ret; } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5828,7 +6019,8 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || + sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -5842,20 +6034,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) struct io_tctx_node *node; int ret; - /* tasks should wait for their io-wq threads, so safe w/o sync */ - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); - if (ret != -ENOENT) - goto done; - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); + ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); - if (ret != -ENOENT) - goto done; - spin_unlock(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5868,17 +6049,10 @@ static int 
io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) break; } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - - spin_lock(&ctx->completion_lock); done: - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5887,7 +6061,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6093,7 +6267,7 @@ fail: if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req); + io_queue_async_work(req, NULL); return true; } @@ -6316,14 +6490,17 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; - io_req_refcount(req); - /* will be dropped by ->io_free_work() after returning to io-wq */ - req_ref_get(req); + /* one will be dropped by ->io_free_work() after returning to io-wq */ + if (!(req->flags & REQ_F_REFCOUNT)) + __io_req_set_refcount(req, 2); + else + req_ref_get(req); timeout = io_prep_linked_timeout(req); if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -6413,15 +6590,15 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, return io_file_get_normal(ctx, req, fd); } -static void io_req_task_link_timeout(struct io_kiocb *req) +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; - struct io_ring_ctx *ctx = req->ctx; + int ret; if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); + ret = io_try_cancel_userdata(req, prev->user_data); + io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); - io_put_req(req); } else { io_req_complete_post(req, -ETIME, 0); } @@ -6448,6 +6625,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } + list_del(&req->timeout.list); req->timeout.prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -6471,6 +6649,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + list_add_tail(&req->timeout.list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -6480,7 +6659,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; int ret; issue_sqe: @@ -6498,24 +6677,34 @@ issue_sqe: state->compl_reqs[state->compl_nr++] = req; if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); + return; } + + linked_timeout = io_prep_linked_timeout(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + linked_timeout = io_prep_linked_timeout(req); + switch (io_arm_poll_handler(req)) { case IO_APOLL_READY: + if (linked_timeout) + 
io_unprep_linked_timeout(req); goto issue_sqe; case IO_APOLL_ABORTED: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req); + io_queue_async_work(req, NULL); break; } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else { io_req_complete_failed(req, ret); } - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) @@ -6524,15 +6713,17 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; - if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); + } else if (req->flags & REQ_F_FAIL) { + io_req_complete_failed(req, req->result); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req); + io_queue_async_work(req, NULL); } } @@ -6634,20 +6825,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: + /* fail even hard links since we don't submit */ if (link->head) { - /* fail even hard links since we don't submit */ - req_set_fail(link->head); - io_req_complete_failed(link->head, -ECANCELED); - link->head = NULL; + /* + * Whether a link req failed or was cancelled is indicated by + * REQ_F_FAIL, but the head is an exception: it may have + * REQ_F_FAIL set only because some other req in the chain + * failed. Leverage req->result to distinguish whether the head + * carries its own failure code or one inherited from another + * req, so the correct ret code can be set for it. Init result + * here to avoid affecting the normal path. + */ + if (!(link->head->flags & REQ_F_FAIL)) + req_fail_link_node(link->head, -ECANCELED); + } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + /* + * the current req is a normal req: return the + * error and thus break the submission loop.
+ */ + io_req_complete_failed(req, ret); + return ret; } - io_req_complete_failed(req, ret); - return ret; + req_fail_link_node(req, ret); + } else { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; } - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, req->flags, true, @@ -6663,9 +6868,14 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; + if (!(req->flags & REQ_F_FAIL)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) { + req_fail_link_node(req, ret); + if (!(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + } + } trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; @@ -6760,25 +6970,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; + io_get_task_refs(nr); - tctx = current->io_uring; - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) { - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, ¤t->usage); - tctx->cached_refs += refill; - } io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; @@ -6791,7 +6991,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - kmem_cache_free(req_cachep, req); + list_add(&req->inflight_entry, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ @@ -6856,7 +7056,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0, true); + io_do_iopoll(ctx, &nr_events, 0); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -7136,14 +7336,14 @@ static void **io_alloc_page_table(size_t size) size_t init_size = size; void **table; - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - table[i] = kzalloc(this_size, GFP_KERNEL); + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; @@ -7334,7 +7534,8 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); return !!table->files; } @@ -7731,6 +7932,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; @@ -7840,6 +8043,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb 
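/*
 * Illustrative sketch, not part of this diff: the hunks above switch the
 * io_uring table allocations from GFP_KERNEL to GFP_KERNEL_ACCOUNT, so
 * userspace-driven rings charge these buffers to the caller's memory
 * cgroup. The bare pattern, in a hypothetical helper:
 */
#include <linux/slab.h>

static void **example_alloc_user_visible_table(size_t nr_entries)
{
	/* accounted to the current task's memcg, unlike plain GFP_KERNEL */
	return kcalloc(nr_entries, sizeof(void *), GFP_KERNEL_ACCOUNT);
}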
*req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -8699,6 +8942,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) sock_release(ctx->ring_sock); } #endif + WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -9126,8 +9370,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) * Must be after io_uring_del_task_file() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ - tctx->io_wq = NULL; io_wq_put_and_exit(wq); + tctx->io_wq = NULL; } } @@ -9213,9 +9457,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, @@ -10053,6 +10297,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10070,6 +10339,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; @@ -10226,6 +10496,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; @@ -10307,11 +10583,16 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct 
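/*
 * Illustrative sketch, not part of this diff: driving the new
 * IORING_REGISTER_IOWQ_MAX_WORKERS opcode from userspace. The argument is
 * an array of two __u32 values ({bounded, unbounded} worker limits), and
 * the previous limits are copied back on success. The opcode value below
 * matches the 5.15 UAPI; the helper name is made up, and the raw syscall
 * is used in case liburing lacks a wrapper.
 */
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef IORING_REGISTER_IOWQ_MAX_WORKERS
#define IORING_REGISTER_IOWQ_MAX_WORKERS 19
#endif

static long example_set_iowq_limits(int ring_fd, uint32_t counts[2])
{
	/* nr_args must be 2, matching the check in __io_uring_register() */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
}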
io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); + + /* ->buf_index is u16 */ + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); diff --git a/fs/ioctl.c b/fs/ioctl.c index 1e2204fa9963..eea8267ae1f2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -263,209 +263,6 @@ static long ioctl_file_clone_range(struct file *file, args.src_length, args.dest_offset); } -#ifdef CONFIG_BLOCK - -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) -{ - return (offset >> inode->i_blkbits); -} - -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) -{ - return (blk << inode->i_blkbits); -} - -/** - * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode: the inode to map - * @fieinfo: the fiemap info struct that will be passed back to userspace - * @start: where to start mapping in the inode - * @len: how much space to map - * @get_block: the fs's get_block function - * - * This does FIEMAP for block based inodes. Basically it will just loop - * through get_block until we hit the number of extents we want to map, or we - * go past the end of the file and hit a hole. - * - * If it is possible to have data blocks beyond a hole past @inode->i_size, then - * please do not use this function, it will stop at the first unmapped block - * beyond i_size. - * - * If you use this function directly, you need to do your own locking. Use - * generic_block_fiemap if you want the locking done for you. - */ -static int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, loff_t start, - loff_t len, get_block_t *get_block) -{ - struct buffer_head map_bh; - sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); - u64 logical = 0, phys = 0, size = 0; - u32 flags = FIEMAP_EXTENT_MERGED; - bool past_eof = false, whole_file = false; - int ret = 0; - - ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - /* - * Either the i_mutex or other appropriate locking needs to be held - * since we expect isize to not change at all through the duration of - * this call. - */ - if (len >= isize) { - whole_file = true; - len = isize; - } - - /* - * Some filesystems can't deal with being asked to map less than - * blocksize, so make sure our len is at least block length. 
- */ - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); - - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); - - do { - /* - * we set b_size to the total size we want so it will map as - * many contiguous blocks as possible at once - */ - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len; - - ret = get_block(inode, start_blk, &map_bh, 0); - if (ret) - break; - - /* HOLE */ - if (!buffer_mapped(&map_bh)) { - start_blk++; - - /* - * We want to handle the case where there is an - * allocated block at the front of the file, and then - * nothing but holes up to the end of the file properly, - * to make sure that extent at the front gets properly - * marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && - blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - /* - * First hole after going past the EOF, this is our - * last extent - */ - if (past_eof && size) { - flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - break; - } else { - /* - * We have gone over the length of what we wanted to - * map, and it wasn't the entire file, so add the extent - * we got last time and exit. - * - * This is for the case where say we want to map all the - * way up to the second to the last block in a file, but - * the last block is a hole, making the second to last - * block FIEMAP_EXTENT_LAST. In this case we want to - * see if there is a hole after the second to last block - * so we can mark it properly. If we found data after - * we exceeded the length we were requesting, then we - * are good to go, just add the extent to the fieinfo - * and break - */ - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - break; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - if (ret) - break; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = FIEMAP_EXTENT_MERGED; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } - cond_resched(); - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - } while (1); - - /* If ret is 1 then we just hit the end of the extent array */ - if (ret == 1) - ret = 0; - - return ret; -} - -/** - * generic_block_fiemap - FIEMAP for block based inodes - * @inode: The inode to map - * @fieinfo: The mapping information - * @start: The initial block to map - * @len: The length of the extect to attempt to map - * @get_block: The block mapping function for the fs - * - * Calls __generic_block_fiemap to map the inode, after taking - * the inode's mutex lock. 
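/*
 * Illustrative sketch, not part of this diff: with the in-kernel
 * generic_block_fiemap()/__generic_block_fiemap() helpers removed here,
 * the user-visible interface is unchanged; extent maps still come from the
 * FS_IOC_FIEMAP ioctl. A minimal userspace caller (helper name made up):
 */
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int example_dump_extents(int fd)
{
	unsigned int i, n = 32;	/* fetch at most 32 extents for brevity */
	struct fiemap *fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));

	if (!fm)
		return -1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = n;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		free(fm);
		return -1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu -> physical %llu, %llu bytes, flags %#x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	return 0;
}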
- */ - -int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) -{ - int ret; - inode_lock(inode); - ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - inode_unlock(inode); - return ret; -} -EXPORT_SYMBOL(generic_block_fiemap); - -#endif /* CONFIG_BLOCK */ - /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 21edc423b79f..678e2c51b855 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -155,7 +155,6 @@ struct iso9660_options{ unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; - unsigned int utf8:1; unsigned char map; unsigned char check; unsigned int blocksize; @@ -356,7 +355,6 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->gid = GLOBAL_ROOT_GID; popt->uid = GLOBAL_ROOT_UID; popt->iocharset = NULL; - popt->utf8 = 0; popt->overriderockperm = 0; popt->session=-1; popt->sbsector=-1; @@ -389,10 +387,13 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_cruft: popt->cruft = 1; break; +#ifdef CONFIG_JOLIET case Opt_utf8: - popt->utf8 = 1; + kfree(popt->iocharset); + popt->iocharset = kstrdup("utf8", GFP_KERNEL); + if (!popt->iocharset) + return 0; break; -#ifdef CONFIG_JOLIET case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); @@ -495,7 +496,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); - if (sbi->s_utf8) seq_puts(m, ",utf8"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); @@ -518,9 +518,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset && - strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) + if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); + else + seq_puts(m, ",iocharset=utf8"); #endif return 0; } @@ -863,14 +864,13 @@ root_found: sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET - if (joliet_level && opt.utf8 == 0) { + if (joliet_level) { char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; - sbi->s_nls_iocharset = load_nls(p); - if (! sbi->s_nls_iocharset) { - /* Fail only if explicit charset specified */ - if (opt.iocharset) + if (strcmp(p, "utf8") != 0) { + sbi->s_nls_iocharset = opt.iocharset ? 
+ load_nls(opt.iocharset) : load_nls_default(); + if (!sbi->s_nls_iocharset) goto out_freesbi; - sbi->s_nls_iocharset = load_nls_default(); } } #endif @@ -886,7 +886,6 @@ root_found: sbi->s_gid = opt.gid; sbi->s_uid_set = opt.uid_set; sbi->s_gid_set = opt.gid_set; - sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; sbi->s_overriderockperm = opt.overriderockperm; /* diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 055ec6c586f7..dcdc191ed183 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -44,7 +44,6 @@ struct isofs_sb_info { unsigned char s_session; unsigned int s_high_sierra:1; unsigned int s_rock:2; - unsigned int s_utf8:1; unsigned int s_cruft:1; /* Broken disks with high byte of length * containing junk */ unsigned int s_nocompress:1; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index be8b6a9d0b92..c0f04a1e7f69 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { - unsigned char utf8; struct nls_table *nls; unsigned char len = 0; - utf8 = ISOFS_SB(inode->i_sb)->s_utf8; nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; - if (utf8) { + if (!nls) { len = utf16s_to_utf8s((const wchar_t *) de->name, de->name_len[0] >> 1, UTF16_BIG_ENDIAN, outname, PAGE_SIZE); diff --git a/fs/locks.c b/fs/locks.c index 74b2a1dfe8d8..3d6fb4ae847b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1397,103 +1397,6 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -/** - * locks_mandatory_locked - Check for an active lock - * @file: the file to check - * - * Searches the inode's list of locks to find any POSIX locks which conflict. - * This function is called from locks_verify_locked() only. - */ -int locks_mandatory_locked(struct file *file) -{ - int ret; - struct inode *inode = locks_inode(file); - struct file_lock_context *ctx; - struct file_lock *fl; - - ctx = smp_load_acquire(&inode->i_flctx); - if (!ctx || list_empty_careful(&ctx->flc_posix)) - return 0; - - /* - * Search the lock list for this inode for any POSIX locks. - */ - spin_lock(&ctx->flc_lock); - ret = 0; - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (fl->fl_owner != current->files && - fl->fl_owner != file) { - ret = -EAGAIN; - break; - } - } - spin_unlock(&ctx->flc_lock); - return ret; -} - -/** - * locks_mandatory_area - Check for a conflicting lock - * @inode: the file to check - * @filp: how the file was opened (if it was) - * @start: first byte in the file to check - * @end: lastbyte in the file to check - * @type: %F_WRLCK for a write lock, else %F_RDLCK - * - * Searches the inode's list of locks to find any POSIX locks which conflict. 
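/*
 * Illustrative sketch, not part of this diff: after the isofs/joliet hunks
 * above, "utf8" is just an alias for iocharset=utf8, and a NULL
 * ->s_nls_iocharset now means UTF-8, decoded straight from the on-disc
 * UTF-16BE Joliet names rather than through an NLS table. A condensed view
 * of that path (helper name made up):
 */
#include <linux/nls.h>
#include <linux/types.h>

static int example_joliet_utf8_name(const __u8 *de_name, int de_name_len,
				    unsigned char *out, int out_max)
{
	/* a NULL ->s_nls_iocharset selects this UTF-8 branch */
	return utf16s_to_utf8s((const wchar_t *)de_name, de_name_len >> 1,
			       UTF16_BIG_ENDIAN, out, out_max);
}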
- */ -int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, - loff_t end, unsigned char type) -{ - struct file_lock fl; - int error; - bool sleep = false; - - locks_init_lock(&fl); - fl.fl_pid = current->tgid; - fl.fl_file = filp; - fl.fl_flags = FL_POSIX | FL_ACCESS; - if (filp && !(filp->f_flags & O_NONBLOCK)) - sleep = true; - fl.fl_type = type; - fl.fl_start = start; - fl.fl_end = end; - - for (;;) { - if (filp) { - fl.fl_owner = filp; - fl.fl_flags &= ~FL_SLEEP; - error = posix_lock_inode(inode, &fl, NULL); - if (!error) - break; - } - - if (sleep) - fl.fl_flags |= FL_SLEEP; - fl.fl_owner = current->files; - error = posix_lock_inode(inode, &fl, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(fl.fl_wait, - list_empty(&fl.fl_blocked_member)); - if (!error) { - /* - * If we've been sleeping someone might have - * changed the permissions behind our back. - */ - if (__mandatory_lock(inode)) - continue; - } - - break; - } - locks_delete_block(&fl); - - return error; -} -EXPORT_SYMBOL(locks_mandatory_area); -#endif /* CONFIG_MANDATORY_FILE_LOCKING */ - static void lease_clear_pending(struct file_lock *fl, int arg) { switch (arg) { @@ -2486,14 +2389,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2611,21 +2506,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2857,8 +2743,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_puts(f, "POSIX "); seq_printf(f, " %s ", - (inode == NULL) ? "*NOINODE*" : - mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { seq_puts(f, "FLOCK MSNFS "); diff --git a/fs/namei.c b/fs/namei.c index bf6d8a738c59..471eb9fead6e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3023,9 +3023,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. 
*/ - error = locks_verify_locked(filp); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/namespace.c b/fs/namespace.c index 97adcb5ab5d5..20caa4b4c539 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1715,22 +1715,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -static bool may_mandlock(void) +static void warn_mandlock(void) { - pr_warn_once("======================================================\n" - "WARNING: the mand mount option is being deprecated and\n" - " will be removed in v5.15!\n" - "======================================================\n"); - return capable(CAP_SYS_ADMIN); + pr_warn_once("=======================================================\n" + "WARNING: The mand mount option has been deprecated\n" + " and is ignored by this kernel. Remove the mand\n" + " option from the mount to silence this warning.\n" + "=======================================================\n"); } -#else -static inline bool may_mandlock(void) -{ - pr_warn("VFS: \"mand\" mount option not supported"); - return false; -} -#endif static int can_umount(const struct path *path, int flags) { @@ -3197,8 +3189,8 @@ int path_mount(const char *dev_name, struct path *path, return ret; if (!may_mount()) return -EPERM; - if ((flags & SB_MANDLOCK) && !may_mandlock()) - return -EPERM; + if (flags & SB_MANDLOCK) + warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) @@ -3581,9 +3573,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; - ret = -EPERM; - if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) - goto err_unlock; + if (fc->sb_flags & SB_MANDLOCK) + warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1fef107961bc..514be5d28d70 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,10 +806,6 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); - /* No mandatory locks over NFS */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fa67ecd5fe63..8313e1dbb5dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5735,16 +5735,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, NFS4_SHARE_DENY_READ); } -/* - * Allow READ/WRITE during grace period on recovered state only for files - * that are not able to provide mandatory locking.
- */ -static inline int -grace_disallows_io(struct net *net, struct inode *inode) -{ - return opens_in_grace(net) && mandatory_lock(inode); -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -6026,7 +6016,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **nfp, struct nfs4_stid **cstid) { - struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; @@ -6035,9 +6024,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, if (nfp) *nfp = NULL; - if (grace_disallows_io(net, ino)) - return nfserr_grace; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); goto done; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a224a5e23cc1..92e77f92268a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -333,7 +333,6 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { struct inode *inode = d_inode(fhp->fh_dentry); - int host_err; if (iap->ia_size < inode->i_size) { __be32 err; @@ -343,20 +342,7 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) return err; } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserrno; - - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) - goto out_put_write_access; - return 0; - -out_put_write_access: - put_write_access(inode); -out_nfserrno: - return nfserrno(host_err); + return nfserrno(get_write_access(inode)); } /* @@ -750,13 +736,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; - /* - * We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. 
- */ - if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) - goto out; if (!inode->i_fop) goto out; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 28b67cb9458d..6facdf476255 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> +#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 -#define FANOTIFY_INFO_HDR_LEN \ +#define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len) if (name_len) info_len += name_len + 1; - return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); + return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, + FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int fid_mode, +static int fanotify_event_info_len(unsigned int info_mode, struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); @@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode, if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); - } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + } else if ((info_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".". 
@@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode, dot_len = 1; } + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); @@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (fid_mode) - event_size += fanotify_event_info_len(fid_mode, event); + if (info_mode) + event_size += fanotify_event_info_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - int info_type, const char *name, size_t name_len, - char __user *buf, size_t count) +static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + int info_type, const char *name, + size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; @@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static int copy_info_records_to_user(struct fanotify_event *event, + struct fanotify_info *info, + unsigned int info_mode, int pidfd, + char __user *buf, size_t count) +{ + int ret, total_bytes = 0, info_type = 0; + unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + + /* + * Event info records order is as follows: dir fid + name, child fid. + */ + if (fanotify_event_dir_fh_len(event)) { + info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_DFID; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir_fh(info), + info_type, + fanotify_info_name(info), + info->name_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the name + * "." with info type DFID_NAME. 
+ */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + return total_bytes; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) @@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL; - int ret, fd = FAN_NOFD; - int info_type = 0; + int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(fid_mode, event); + fanotify_event_info_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } metadata.fd = fd; + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incorporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. + */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Report FAN_NOPIDFD to the event listener + * in those cases, with all other pidfd creation errors being + * reported as FAN_EPIDFD. + */ + if (metadata.pid == 0 || + !pid_has_task(event->pid, PIDTYPE_TGID)) { + pidfd = FAN_NOPIDFD; + } else { + pidfd = pidfd_create(event->pid, 0); + if (pidfd < 0) + pidfd = FAN_EPIDFD; + } + } + ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and @@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); - /* Event info records order is: dir fid + name, child fid */ - if (fanotify_event_dir_fh_len(event)) { - info_type = info->name_len ?
FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_DFID; - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_info_dir_fh(info), - info_type, fanotify_info_name(info), - info->name_len, buf, count); + if (info_mode) { + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); if (ret < 0) goto out_close_fd; - - buf += ret; - count -= ret; - } - - if (fanotify_event_object_fh_len(event)) { - const char *dot = NULL; - int dot_len = 0; - - if (fid_mode == FAN_REPORT_FID || info_type) { - /* - * With only group flag FAN_REPORT_FID only type FID is - * reported. Second info record type is always FID. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } else if ((fid_mode & FAN_REPORT_NAME) && - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_NAME, if name was not - * recorded in an event on a directory, report the - * name "." with info type DFID_NAME. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; - dot = "."; - dot_len = 1; - } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_DIR_FID, a single info - * record has type DFID for directory entry modification - * event and for event on a directory. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID; - } else { - /* - * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, - * a single info record has type FID for event on a - * non-directory, when there is no directory to report. - * For example, on FAN_DELETE_SELF event. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } - - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - info_type, dot, dot_len, buf, count); - if (ret < 0) - goto out_close_fd; - - buf += ret; - count -= ret; } return metadata.event_len; @@ -537,6 +628,10 @@ out_close_fd: put_unused_fd(fd); fput(f); } + + if (pidfd >= 0) + close_fd(pidfd); + return ret; } @@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. 
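/*
 * Illustrative sketch, not part of this diff: consuming the new
 * FAN_EVENT_INFO_TYPE_PIDFD record from userspace. Assumes libc headers
 * carrying the 5.15 fanotify UAPI (FAN_REPORT_PIDFD, FAN_NOPIDFD,
 * FAN_EPIDFD, struct fanotify_event_info_pidfd); for brevity only the
 * first info record after the metadata is inspected, although fid records
 * may precede the pidfd record. fan_fd would come from
 * fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_PIDFD, O_RDONLY).
 */
#include <sys/fanotify.h>
#include <unistd.h>

static void example_drain(int fan_fd)
{
	char buf[4096];
	ssize_t len = read(fan_fd, buf, sizeof(buf));
	struct fanotify_event_metadata *md = (struct fanotify_event_metadata *)buf;

	for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		/* info records, if any, follow the fixed-size metadata */
		struct fanotify_event_info_header *hdr =
			(struct fanotify_event_info_header *)(md + 1);

		if (md->event_len > md->metadata_len &&
		    hdr->info_type == FAN_EVENT_INFO_TYPE_PIDFD) {
			struct fanotify_event_info_pidfd *p =
				(struct fanotify_event_info_pidfd *)hdr;

			/* FAN_NOPIDFD / FAN_EPIDFD flag the error cases */
			if (p->pidfd >= 0)
				close(p->pidfd);	/* use it, then close */
		}
		if (md->fd >= 0)
			close(md->fd);
	}
}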
+ */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30d422b8c0fc..963e6ce75b96 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); - /* Wait for outstanding inode references from connectors */ - wait_var_event(&sb->s_fsnotify_inode_refs, - !atomic_long_read(&sb->s_fsnotify_inode_refs)); } void fsnotify_sb_delete(struct super_block *sb) { fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); + /* Wait for outstanding object references from connectors */ + wait_var_event(&sb->s_fsnotify_connectors, + !atomic_long_read(&sb->s_fsnotify_connectors)); } /* diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index ff2063ec6b0f..87d8a50ee803 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb( return container_of(conn->obj, struct super_block, s_fsnotify_marks); } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + switch (conn->type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return fsnotify_conn_inode(conn)->i_sb; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return fsnotify_conn_mount(conn)->mnt.mnt_sb; + case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_conn_sb(conn); + default: + return NULL; + } +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d32ab349db74..95006d1d29ab 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } +static void fsnotify_get_inode_ref(struct inode *inode) +{ + ihold(inode); + atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + +static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + atomic_long_inc(&sb->s_fsnotify_connectors); +} + +static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if 
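/*
 * Illustrative sketch, not part of this diff: the s_fsnotify_connectors
 * lifetime introduced above relies on the generic wait_var_event() /
 * wake_up_var() pairing, where the final put wakes whoever is waiting for
 * the count to reach zero. The bare kernel-side pattern (names made up):
 */
#include <linux/atomic.h>
#include <linux/wait_bit.h>

static atomic_long_t example_connectors;

static void example_put_connector(void)
{
	/* the last reference dropped wakes the waiter below */
	if (atomic_long_dec_and_test(&example_connectors))
		wake_up_var(&example_connectors);
}

static void example_wait_for_connectors(void)
{
	/* sleeps until the counter is observed at zero */
	wait_var_event(&example_connectors,
		       !atomic_long_read(&example_connectors));
}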
(conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } + fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { - struct inode *inode; - struct super_block *sb; - if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; - inode = objp; - sb = inode->i_sb; - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) - wake_up_var(&sb->s_fsnotify_inode_refs); + fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - inode = igrab(fsnotify_conn_inode(conn)); + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { + inode = fsnotify_conn_inode(conn); + fsnotify_get_inode_ref(inode); + } + fsnotify_get_sb_connectors(conn); + /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure @@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ if (inode) - iput(inode); + fsnotify_put_inode_ref(inode); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index fab7c6a4a7d0..73a3854b2afb 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -101,8 +101,6 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (__mandatory_lock(inode)) - return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) @@ -121,8 +119,6 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); } diff --git a/fs/open.c b/fs/open.c index 94bef26ff1b6..daa324606a41 100644 --- a/fs/open.c +++ b/fs/open.c @@ -105,9 +105,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto put_write_and_out; - error = locks_verify_truncate(inode, NULL, length); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); @@ -189,9 +187,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, f.file, length); - if (!error) - error = security_path_truncate(&f.file->f_path); + error = security_path_truncate(&f.file->f_path); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); diff --git a/fs/pipe.c b/fs/pipe.c index 678dee2a8228..6d4342bad9f1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. 
*/ - if (unlikely(was_full)) { + if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can @@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) wake_next_reader = false; __pipe_unlock(pipe); - if (was_full) { + if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; @@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * become empty while we dropped the lock. */ __pipe_unlock(pipe); - if (was_empty) { + if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); __pipe_lock(pipe); was_empty = pipe_empty(pipe->head, pipe->tail); @@ -591,10 +588,9 @@ out: * Epoll nonsensically wants a wakeup whether the pipe * was already empty or not. */ - if (was_empty || pipe->poll_usage) { + if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { diff --git a/fs/read_write.c b/fs/read_write.c index 9db7adf160d2..af057c57bdc6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -365,12 +365,8 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { - struct inode *inode; - int retval = -EINVAL; - - inode = file_inode(file); if (unlikely((ssize_t) count < 0)) - return retval; + return -EINVAL; /* * ranged mandatory locking does not apply to streams - it makes sense @@ -381,19 +377,12 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) - return retval; + return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) - return retval; - } - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area(inode, file, pos, pos + count - 1, - read_write == READ ? F_RDLCK : F_WRLCK); - if (retval < 0) - return retval; + return -EINVAL; } } diff --git a/fs/remap_range.c b/fs/remap_range.c index e4a5fdd7ad7b..6d4a9beaa097 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -99,24 +99,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { - struct inode *inode = file_inode(file); - if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) return -EINVAL; - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? 
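/*
 * Illustrative sketch, not part of this diff: with the mandatory-locking
 * branch gone, rw_verify_area() above reduces to pure range checking. The
 * overflow rule it keeps, restated in portable C (ignoring the
 * ->unsigned_offsets() escape hatch for negative positions):
 */
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>

static bool example_range_ok(long long pos, size_t count)
{
	if (pos < 0)
		return false;
	/* pos + count must not wrap past LLONG_MAX */
	return count <= (unsigned long long)(LLONG_MAX - pos);
}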
F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = 
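/*
 * Illustrative sketch, not part of this diff: what the bvec_virt() call
 * sites in the squashfs hunks above resolve to. Behaviourally it is the
 * old open-coded expression, and is only valid for lowmem pages (no kmap
 * is involved):
 */
#include <linux/bvec.h>
#include <linux/mm.h>

static inline void *example_bvec_virt(const struct bio_vec *bvec)
{
	/* kernel virtual address of the data described by the bvec */
	return page_address(bvec->bv_page) + bvec->bv_offset;
}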
bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/timerfd.c b/fs/timerfd.c index c5509d2448e3..e9c96a0c79f1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -115,6 +115,22 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } +static void timerfd_resume_work(struct work_struct *work) +{ + timerfd_clock_was_set(); +} + +static DECLARE_WORK(timerfd_work, timerfd_resume_work); + +/* + * Invoked from timekeeping_resume(). Defer the actual update to work so + * timerfd_clock_was_set() runs in task context. + */ +void timerfd_resume(void) +{ + schedule_work(&timerfd_work); +} + static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index c19dba45aa20..70abdfad2df1 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -35,7 +35,6 @@ #include "udf_i.h" #include "udf_sb.h" - static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); @@ -135,7 +134,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) lfi = cfi.lengthFileIdent; if (fibh.sbh == fibh.ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -153,7 +152,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh.ebh->b_data, poffset); diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 185c3e247648..de17a97e8667 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -307,14 +307,14 @@ struct logicalVolDesc { struct regid impIdent; uint8_t impUse[128]; struct extent_ad integritySeqExt; - uint8_t partitionMaps[0]; + uint8_t partitionMaps[]; } __packed; /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ struct genericPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; - uint8_t partitionMapping[0]; + uint8_t partitionMapping[]; } __packed; /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ @@ -342,7 +342,7 @@ struct unallocSpaceDesc { struct tag descTag; __le32 volDescSeqNum; __le32 numAllocDescs; - struct extent_ad allocDescs[0]; + struct extent_ad allocDescs[]; } __packed; /* Terminating Descriptor (ECMA 167r3 3/10.9) */ @@ -360,9 +360,9 @@ struct logicalVolIntegrityDesc { uint8_t logicalVolContentsUse[32]; __le32 numOfPartitions; __le32 lengthOfImpUse; - __le32 freeSpaceTable[0]; - __le32 sizeTable[0]; - uint8_t impUse[0]; + __le32 freeSpaceTable[]; + /* __le32 sizeTable[]; */ + /* uint8_t impUse[]; */ } __packed; /* Integrity Type (ECMA 167r3 3/10.10.3) */ @@ -471,9 +471,9 @@ struct fileIdentDesc { uint8_t lengthFileIdent; struct long_ad icb; __le16 lengthOfImpUse; - uint8_t impUse[0]; - uint8_t fileIdent[0]; - uint8_t padding[0]; + uint8_t impUse[]; + /* uint8_t fileIdent[]; */ + /* uint8_t padding[]; */ } __packed; /* File Characteristics (ECMA 167r3 4/14.4.3) */ @@ -578,8 +578,8 @@ struct fileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t 
extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; /* Permissions (ECMA 167r3 4/14.9.5) */ @@ -632,7 +632,7 @@ struct genericFormat { uint8_t attrSubtype; uint8_t reserved[3]; __le32 attrLength; - uint8_t attrData[0]; + uint8_t attrData[]; } __packed; /* Character Set Information (ECMA 167r3 4/14.10.3) */ @@ -643,7 +643,7 @@ struct charSetInfo { __le32 attrLength; __le32 escapeSeqLength; uint8_t charSetType; - uint8_t escapeSeq[0]; + uint8_t escapeSeq[]; } __packed; /* Alternate Permissions (ECMA 167r3 4/14.10.4) */ @@ -682,7 +682,7 @@ struct infoTimesExtAttr { __le32 attrLength; __le32 dataLength; __le32 infoTimeExistence; - uint8_t infoTimes[0]; + uint8_t infoTimes[]; } __packed; /* Device Specification (ECMA 167r3 4/14.10.7) */ @@ -694,7 +694,7 @@ struct deviceSpec { __le32 impUseLength; __le32 majorDeviceIdent; __le32 minorDeviceIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ @@ -705,7 +705,7 @@ struct impUseExtAttr { __le32 attrLength; __le32 impUseLength; struct regid impIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ @@ -716,7 +716,7 @@ struct appUseExtAttr { __le32 attrLength; __le32 appUseLength; struct regid appIdent; - uint8_t appUse[0]; + uint8_t appUse[]; } __packed; #define EXTATTR_CHAR_SET 1 @@ -733,7 +733,7 @@ struct unallocSpaceEntry { struct tag descTag; struct icbtag icbTag; __le32 lengthAllocDescs; - uint8_t allocDescs[0]; + uint8_t allocDescs[]; } __packed; /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ @@ -741,7 +741,7 @@ struct spaceBitmapDesc { struct tag descTag; __le32 numOfBits; __le32 numOfBytes; - uint8_t bitmap[0]; + uint8_t bitmap[]; } __packed; /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ @@ -780,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dchars componentIdent[0]; + dchars componentIdent[]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ @@ -809,8 +809,8 @@ struct extendedFileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; #endif /* _ECMA_167_H */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4917670860a0..1d6b7a50736b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -390,8 +390,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - sfi->fileIdent + - le16_to_cpu(sfi->lengthOfImpUse))) { + udf_get_fi_ident(sfi))) { iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; brelse(dbh); return NULL; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index eab94527340d..1614d308d0f0 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, else offset = le32_to_cpu(eahd->appAttrLocation); - while (offset < iinfo->i_lenEAttr) { + while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { + uint32_t attrLength; + gaf = (struct genericFormat *)&ea[offset]; + attrLength = le32_to_cpu(gaf->attrLength); + + /* Detect undersized elements and buffer overflows */ + if ((attrLength < sizeof(*gaf)) || + (attrLength > (iinfo->i_lenEAttr - offset))) + break; + if 
(le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else - offset += le32_to_cpu(gaf->attrLength); + offset += attrLength; } } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c7c9bbbfa57..caeef08efed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -74,12 +74,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + memcpy(udf_get_fi_ident(sfi), fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, - -offset); + memcpy(udf_get_fi_ident(sfi), fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +87,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } @@ -226,7 +225,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, lfi = cfi->lengthFileIdent; if (fibh->sbh == fibh->ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -246,7 +245,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh->ebh->b_data, poffset); diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 22bc4fb2feb9..157de0ec0cd5 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -111,7 +111,7 @@ struct logicalVolIntegrityDescImpUse { __le16 minUDFReadRev; __le16 minUDFWriteRev; __le16 maxUDFWriteRev; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ @@ -178,15 +178,6 @@ struct metadataPartitionMap { uint8_t reserved2[5]; } __packed; -/* Virtual Allocation Table (UDF 1.5 2.2.10) */ -struct virtualAllocationTable15 { - __le32 vatEntry[0]; - struct regid vatIdent; - __le32 previousVATICBLoc; -} __packed; - -#define ICBTAG_FILE_TYPE_VAT15 0x00U - /* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; @@ -199,8 +190,8 @@ struct virtualAllocationTable20 { __le16 minUDFWriteRev; __le16 maxUDFWriteRev; __le16 reserved; - uint8_t impUse[0]; - __le32 vatEntry[0]; + uint8_t impUse[]; + /* __le32 vatEntry[]; */ } __packed; #define ICBTAG_FILE_TYPE_VAT20 0xF8U @@ -217,8 +208,7 @@ struct sparingTable { __le16 reallocationTableLen; __le16 reserved; __le32 sequenceNum; - struct sparingEntry - mapEntry[0]; + struct sparingEntry mapEntry[]; } __packed; /* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ @@ -241,7 +231,7 @@ struct allocDescImpUse { /* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t freeEASpace[]; } __packed; /* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ @@ -265,7 +255,7 @@ struct LVExtensionEA { /* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t 
freeEASpace[]; } __packed; /* UDF Defined System Stream (UDF 2.60 3.3.7) */ diff --git a/fs/udf/super.c b/fs/udf/super.c index 2f83c1204e20..b2d7c57d0688 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) return NULL; lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; partnum = le32_to_cpu(lvid->numOfPartitions); - if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) - - offsetof(struct logicalVolIntegrityDesc, impUse)) / - (2 * sizeof(uint32_t)) < partnum) { - udf_err(sb, "Logical volume integrity descriptor corrupted " - "(numOfPartitions = %u)!\n", partnum); - return NULL; - } /* The offset is to skip freeSpaceTable and sizeTable arrays */ offset = partnum * 2 * sizeof(uint32_t); - return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); + return (struct logicalVolIntegrityDescImpUse *) + (((uint8_t *)(lvid + 1)) + offset); } /* UDF filesystem type */ @@ -349,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) - seq_puts(seq, ",utf8"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) + if (sbi->s_nls_map) seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + else + seq_puts(seq, ",iocharset=utf8"); return 0; } @@ -558,19 +552,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt, /* Ignored (never implemented properly) */ break; case Opt_utf8: - uopt->flags |= (1 << UDF_FLAG_UTF8); + if (!remount) { + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } break; case Opt_iocharset: if (!remount) { - if (uopt->nls_map) - unload_nls(uopt->nls_map); - /* - * load_nls() failure is handled later in - * udf_fill_super() after all options are - * parsed. 
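The iocharset rework in the fs/udf/super.c hunks here drops the UDF_FLAG_UTF8/UDF_FLAG_NLS_MAP flag pair in favour of a single convention: a NULL nls_map means names are handled as UTF-8. A minimal sketch of that convention, using the kernel's real load_nls()/unload_nls() API; the helper name is hypothetical and not part of this diff:

#include <linux/errno.h>
#include <linux/nls.h>
#include <linux/string.h>

/* Hypothetical helper: NULL uopt->nls_map now means "use UTF-8". */
static int example_set_iocharset(struct udf_options *uopt, char *name)
{
	unload_nls(uopt->nls_map);	/* unload_nls() accepts NULL */
	uopt->nls_map = NULL;
	if (strcmp(name, "utf8") == 0)
		return 0;		/* no table loaded: UTF-8 */
	uopt->nls_map = load_nls(name);
	return uopt->nls_map ? 0 : -EINVAL;
}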
- */ + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } + /* When nls_map is not loaded then UTF-8 is used */ + if (!remount && strcmp(args[0].from, "utf8") != 0) { uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!uopt->nls_map) { + pr_err("iocharset %s not found\n", + args[0].from); + return 0; + } } break; case Opt_uforget: @@ -1542,6 +1541,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; int indirections = 0; + u32 parts, impuselen; while (++indirections <= UDF_MAX_LVID_NESTING) { final_bh = NULL; @@ -1568,15 +1568,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; if (lvid->nextIntegrityExt.extLength == 0) - return; + goto check; loc = leea_to_cpu(lvid->nextIntegrityExt); } udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", UDF_MAX_LVID_NESTING); +out_err: brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; + return; +check: + parts = le32_to_cpu(lvid->numOfPartitions); + impuselen = le32_to_cpu(lvid->lengthOfImpUse); + if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize || + sizeof(struct logicalVolIntegrityDesc) + impuselen + + 2 * parts * sizeof(u32) > sb->s_blocksize) { + udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), " + "ignoring.\n", parts, impuselen); + goto out_err; + } } /* @@ -2139,21 +2151,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!udf_parse_options((char *)options, &uopt, false)) goto parse_options_failure; - if (uopt.flags & (1 << UDF_FLAG_UTF8) && - uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_err(sb, "utf8 cannot be combined with iocharset\n"); - goto parse_options_failure; - } - if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { - uopt.nls_map = load_nls_default(); - if (!uopt.nls_map) - uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); - else - udf_debug("Using default NLS map\n"); - } - if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) - uopt.flags |= (1 << UDF_FLAG_UTF8); - fileset.logicalBlockNum = 0xFFFFFFFF; fileset.partitionReferenceNum = 0xFFFF; @@ -2308,8 +2305,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: - if (uopt.nls_map) - unload_nls(uopt.nls_map); + unload_nls(uopt.nls_map); if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2359,8 +2355,7 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + unload_nls(sbi->s_nls_map); if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 758efe557a19..4fa620543d30 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,8 +20,6 @@ #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 #define UDF_FLAG_VARCONV 8 -#define UDF_FLAG_NLS_MAP 9 -#define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 9dd0814f1077..7e258f15b8ef 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -130,6 +130,10 @@ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } +static inline uint8_t 
*udf_get_fi_ident(struct fileIdentDesc *fi) +{ + return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5fcfa96463eb..622569007b53 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb, return 0; } - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->uni2char; else conv_f = NULL; @@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb, if (ocu_max_len <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->char2uni; else conv_f = NULL; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 213a97a921bb..1cd3f940fa6a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1626,7 +1626,6 @@ xfs_swap_extents( struct xfs_bstat *sbp = &sxp->sx_stat; int src_log_flags, target_log_flags; int error = 0; - int lock_flags; uint64_t f; int resblks = 0; unsigned int flags = 0; @@ -1638,8 +1637,8 @@ xfs_swap_extents( * do the rest of the checks. */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); - lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1711,7 +1710,6 @@ xfs_swap_extents( * or cancel will unlock the inodes from this point onwards. */ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); @@ -1830,13 +1828,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock_ilock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); + filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out_unlock; + goto out_unlock_ilock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..3ab73567a0f5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -844,7 +844,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc3cfb12df53..3dfbdcdb0d1c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1302,7 +1302,7 @@ xfs_file_llseek( * * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) - * i_mmaplock (XFS - truncate serialisation) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1323,24 +1323,27 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, (write_fault && !vmf->cow_page) ? 
&xfs_direct_write_iomap_ops : &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { - if (write_fault) + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); - else + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else { ret = filemap_fault(vmf); + } } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 990b72ae3635..f00145e1a976 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared( /* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, @@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared( * * Basic locking order: * - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock * * mmap_lock locking order: * * i_rwsem -> page lock -> mmap_lock - * mmap_lock -> i_mmap_lock -> page_lock + * mmap_lock -> invalidate_lock -> page_lock * * The difference in mmap_lock locking order mean that we cannot hold the - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock - * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_lock. + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths + * can fault in pages during copy in/out (for buffered IO) or require the + * mmap_lock in get_user_pages() to map the user pages into the kernel address + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page + * fault because page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both - * taken in places where we need to invalidate the page cache in a race + * take both the i_rwsem and the invalidate_lock. These locks should *only* be + * both taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). 
*/ @@ -188,10 +188,13 @@ xfs_ilock( XFS_IOLOCK_DEP(lock_flags)); } - if (lock_flags & XFS_MMAPLOCK_EXCL) - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_MMAPLOCK_SHARED) - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_MMAPLOCK_EXCL) { - if (!mrtryupdate(&ip->i_mmaplock)) + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } else if (lock_flags & XFS_MMAPLOCK_SHARED) { - if (!mrtryaccess(&ip->i_mmaplock)) + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } @@ -258,9 +261,9 @@ xfs_ilock_nowait( out_undo_mmaplock: if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -307,9 +310,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); @@ -335,7 +338,7 @@ xfs_ilock_demote( if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrdemote(&ip->i_mmaplock); + downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_rwsem); @@ -343,9 +346,29 @@ xfs_ilock_demote( } #if defined(DEBUG) || defined(XFS_WARN) -int +static inline bool +__xfs_rwsem_islocked( + struct rw_semaphore *rwsem, + bool shared) +{ + if (!debug_locks) + return rwsem_is_locked(rwsem); + + if (!shared) + return lockdep_is_held_type(rwsem, 0); + + /* + * We are checking that the lock is held at least in shared + * mode but don't care that it might be held exclusively + * (i.e. shared | excl). Hence we check if the lock is held + * in any mode rather than an explicit shared mode. 
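For reference, lockdep_is_held_type() takes 0 for exclusive, 1 for shared, and -1 for "held in any mode", which is what the comment above relies on. A sketch of the -1 form outside XFS, with a hypothetical helper name:

#include <linux/bug.h>
#include <linux/debug_locks.h>
#include <linux/fs.h>
#include <linux/lockdep.h>
#include <linux/rwsem.h>

/* Warn unless invalidate_lock is held, in either shared or exclusive mode. */
static inline void example_assert_invalidate_lock_held(struct address_space *mapping)
{
	if (debug_locks)
		WARN_ON_ONCE(!lockdep_is_held_type(&mapping->invalidate_lock, -1));
	else
		WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock));
}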
+ */ + return lockdep_is_held_type(rwsem, -1); +} + +bool xfs_isilocked( - xfs_inode_t *ip, + struct xfs_inode *ip, uint lock_flags) { if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { @@ -355,20 +378,17 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - if (!(lock_flags & XFS_MMAPLOCK_SHARED)) - return !!ip->i_mmaplock.mr_writer; - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !debug_locks || - lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); - return rwsem_is_locked(&VFS_I(ip)->i_rwsem); + if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } ASSERT(0); - return 0; + return false; } #endif @@ -532,12 +552,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the mmaplock or the ilock, but not more than one type at a time. If we lock - * more than one at a time, lockdep will report false positives saying we have - * violated locking orders. The iolock must be double-locked separately since - * we use i_rwsem for that. We now support taking one lock EXCL and the other - * SHARED. + * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and + * mmaplock must be double-locked separately since we use i_rwsem and + * invalidate_lock for that. We now support taking one lock EXCL and the + * other SHARED. */ void xfs_lock_two_inodes( @@ -555,15 +573,8 @@ xfs_lock_two_inodes( ASSERT(hweight32(ip1_mode) == 1); ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -3741,11 +3752,8 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - if (ip1 == ip2) - xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, - ip2, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); return 0; } @@ -3755,12 +3763,9 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - bool same_inode = (ip1 == ip2); - - xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); inode_unlock(VFS_I(ip2)); - if (!same_inode) + if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 4b6703dbffb8..e0ae905554e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -40,7 +40,6 @@ typedef struct xfs_inode { /* Transaction and locking information. 
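The xfs_ilock2_io_mmap()/xfs_iunlock2_io_mmap() hunks above hand the double-locking problem to the new VFS pair helpers, which order the two locks by mapping address and cope with both files being the same inode. A usage sketch (function name illustrative):

#include <linux/fs.h>

/* Sketch: exclusive page-cache locking across a two-file operation. */
static void example_lock_mapping_pair(struct inode *a, struct inode *b)
{
	filemap_invalidate_lock_two(a->i_mapping, b->i_mapping);
	/* ... invalidate or exchange page cache pages on both inodes ... */
	filemap_invalidate_unlock_two(a->i_mapping, b->i_mapping);
}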
*/ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ /* @@ -410,7 +409,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -int xfs_isilocked(xfs_inode_t *, uint); +bool xfs_isilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2c9e26a44546..102cbd606633 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -709,8 +709,6 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70055d486bf7..ddc346a9df9b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) inode_dio_wait(inode); /* Serialize against page faults */ - down_write(&zi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Serialize against zonefs_iomap_begin() */ mutex_lock(&zi->i_truncate_mutex); @@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) unlock: mutex_unlock(&zi->i_truncate_mutex); - up_write(&zi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } @@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, return ret; } -static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf) -{ - struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file)); - vm_fault_t ret; - - down_read(&zi->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&zi->i_mmap_sem); - - return ret; -} - static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* Serialize against truncates */ - down_read(&zi->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); - up_read(&zi->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = zonefs_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = zonefs_filemap_page_mkwrite, }; @@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); - init_rwsem(&zi->i_mmap_sem); zi->i_wr_refcnt = 0; return &zi->i_vnode; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 51141907097c..7b147907c328 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -70,12 +70,11 @@ struct zonefs_inode_info { * and changes to the inode private data, and in particular changes to * a sequential file size on completion of direct IO writes. * Serialization of mmap read IOs with truncate and syscall IO - * operations is done with i_mmap_sem in addition to i_truncate_mutex. 
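zonefs can drop its private zonefs_filemap_fault() wrapper because, as of this series, filemap_fault() itself takes invalidate_lock shared when it has to populate the page cache, so only the write-fault side still needs filesystem code. A sketch of the resulting shape; the handler and ops names are hypothetical, and the size/DAX checks of the real zonefs handler are omitted:

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>

static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	/* Shared invalidate_lock serialises against truncation. */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);
	sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,	/* generic path now suffices */
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= example_page_mkwrite,
};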
- * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first, - * i_truncate_mutex second). + * operations is done with invalidate_lock in addition to + * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both locks + * (invalidate_lock first, i_truncate_mutex second). */ struct mutex i_truncate_mutex; - struct rw_semaphore i_mmap_sem; /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt;
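Reading the rewritten comment as code: truncation takes invalidate_lock exclusively before i_truncate_mutex, and no path nests the two the other way around. A sketch under those assumptions; the function name is illustrative and the real zone-management work is elided:

#include <linux/fs.h>
#include <linux/mm.h>
#include "zonefs.h"

static int example_seq_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	filemap_invalidate_lock(inode->i_mapping);	/* blocks page faults */
	mutex_lock(&zi->i_truncate_mutex);		/* blocks zonefs_iomap_begin() */
	truncate_setsize(inode, isize);			/* real zone reset/finish elided */
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);
	return 0;
}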