Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_file.c | 13
-rw-r--r--  fs/Kconfig | 10
-rw-r--r--  fs/afs/flock.c | 4
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/block_dev.c | 257
-rw-r--r--  fs/btrfs/inode.c | 2
-rw-r--r--  fs/ceph/addr.c | 9
-rw-r--r--  fs/ceph/caps.c | 21
-rw-r--r--  fs/ceph/file.c | 2
-rw-r--r--  fs/ceph/locks.c | 3
-rw-r--r--  fs/ceph/mds_client.c | 7
-rw-r--r--  fs/ceph/mdsmap.c | 8
-rw-r--r--  fs/ceph/snap.c | 3
-rw-r--r--  fs/ceph/super.h | 3
-rw-r--r--  fs/cifs/smb2ops.c | 2
-rw-r--r--  fs/eventfd.c | 12
-rw-r--r--  fs/ext2/Kconfig | 1
-rw-r--r--  fs/ext2/ext2.h | 11
-rw-r--r--  fs/ext2/file.c | 7
-rw-r--r--  fs/ext2/inode.c | 27
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext4/ext4.h | 10
-rw-r--r--  fs/ext4/extents.c | 25
-rw-r--r--  fs/ext4/file.c | 13
-rw-r--r--  fs/ext4/inode.c | 47
-rw-r--r--  fs/ext4/ioctl.c | 4
-rw-r--r--  fs/ext4/super.c | 13
-rw-r--r--  fs/ext4/truncate.h | 8
-rw-r--r--  fs/f2fs/data.c | 8
-rw-r--r--  fs/f2fs/f2fs.h | 1
-rw-r--r--  fs/f2fs/file.c | 62
-rw-r--r--  fs/f2fs/super.c | 1
-rw-r--r--  fs/f2fs/sysfs.c | 2
-rw-r--r--  fs/fat/fatent.c | 1
-rw-r--r--  fs/fcntl.c | 18
-rw-r--r--  fs/fuse/dax.c | 50
-rw-r--r--  fs/fuse/dir.c | 11
-rw-r--r--  fs/fuse/file.c | 10
-rw-r--r--  fs/fuse/fuse_i.h | 7
-rw-r--r--  fs/fuse/inode.c | 1
-rw-r--r--  fs/gfs2/file.c | 3
-rw-r--r--  fs/hpfs/Kconfig | 1
-rw-r--r--  fs/hpfs/file.c | 51
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/io-wq.c | 136
-rw-r--r--  fs/io-wq.h | 1
-rw-r--r--  fs/io_uring.c | 755
-rw-r--r--  fs/ioctl.c | 203
-rw-r--r--  fs/isofs/inode.c | 27
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/joliet.c | 4
-rw-r--r--  fs/locks.c | 117
-rw-r--r--  fs/namei.c | 4
-rw-r--r--  fs/namespace.c | 29
-rw-r--r--  fs/nfs/file.c | 4
-rw-r--r--  fs/nfsd/nfs4state.c | 14
-rw-r--r--  fs/nfsd/vfs.c | 23
-rw-r--r--  fs/nilfs2/super.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 251
-rw-r--r--  fs/notify/fsnotify.c | 6
-rw-r--r--  fs/notify/fsnotify.h | 15
-rw-r--r--  fs/notify/mark.c | 52
-rw-r--r--  fs/ocfs2/locks.c | 4
-rw-r--r--  fs/open.c | 8
-rw-r--r--  fs/pipe.c | 20
-rw-r--r--  fs/read_write.c | 17
-rw-r--r--  fs/remap_range.c | 12
-rw-r--r--  fs/squashfs/block.c | 7
-rw-r--r--  fs/squashfs/lz4_wrapper.c | 2
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 2
-rw-r--r--  fs/squashfs/xz_wrapper.c | 2
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 2
-rw-r--r--  fs/squashfs/zstd_wrapper.c | 2
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/timerfd.c | 16
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/ecma_167.h | 44
-rw-r--r--  fs/udf/inode.c | 3
-rw-r--r--  fs/udf/misc.c | 13
-rw-r--r--  fs/udf/namei.c | 13
-rw-r--r--  fs/udf/osta_udf.h | 22
-rw-r--r--  fs/udf/super.c | 75
-rw-r--r--  fs/udf/udf_sb.h | 2
-rw-r--r--  fs/udf/udfdecl.h | 4
-rw-r--r--  fs/udf/unicode.c | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 15
-rw-r--r--  fs/xfs/xfs_buf.c | 2
-rw-r--r--  fs/xfs/xfs_file.c | 13
-rw-r--r--  fs/xfs/xfs_inode.c | 121
-rw-r--r--  fs/xfs/xfs_inode.h | 3
-rw-r--r--  fs/xfs/xfs_super.c | 2
-rw-r--r--  fs/zonefs/super.c | 23
-rw-r--r--  fs/zonefs/zonefs.h | 7
93 files changed, 1375 insertions, 1492 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 59c32c9b799f..c4a2dc41beac 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -121,10 +121,6 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
@@ -312,10 +308,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
@@ -327,7 +319,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
ret = v9fs_file_getlock(filp, fl);
else
ret = -EINVAL;
-out_err:
return ret;
}
@@ -348,10 +339,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if (!(fl->fl_flags & FL_FLOCK))
goto out_err;
diff --git a/fs/Kconfig b/fs/Kconfig
index a7749c126b8e..949128bf86c9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -101,16 +101,6 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
-config MANDATORY_FILE_LOCKING
- bool "Enable Mandatory file locking"
- depends on FILE_LOCKING
- default y
- help
- This option enables files appropriately marked files on appropriely
- mounted filesystems to support mandatory locking.
-
- To the best of my knowledge this is dead code that no one cares about.
-
source "fs/crypto/Kconfig"
source "fs/verity/Kconfig"
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index cb3054c7843e..c4210a3964d8 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -772,10 +772,6 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
fl->fl_type, fl->fl_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
- /* AFS doesn't support mandatory locks */
- if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if (IS_GETLK(cmd))
return afs_do_getlk(file, fl);
diff --git a/fs/aio.c b/fs/aio.c
index 76ce0cc3ee4e..51b08ab01dff 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3c7fb7106713..45df6cbccf12 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -35,6 +35,7 @@
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include "internal.h"
+#include "../block/blk.h"
struct bdev_inode {
struct block_device bdev;
@@ -688,7 +689,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
return retval;
}
-int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
@@ -709,7 +711,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return error;
}
-EXPORT_SYMBOL(blkdev_fsync);
/**
* bdev_read_page() - Start reading a page from a block device
@@ -803,7 +804,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
memset(&ei->bdev, 0, sizeof(ei->bdev));
- ei->bdev.bd_bdi = &noop_backing_dev_info;
return &ei->vfs_inode;
}
@@ -814,8 +814,15 @@ static void bdev_free_inode(struct inode *inode)
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
- if (!bdev_is_partition(bdev))
+ if (!bdev_is_partition(bdev)) {
+ if (bdev->bd_disk && bdev->bd_disk->bdi)
+ bdi_put(bdev->bd_disk->bdi);
kfree(bdev->bd_disk);
+ }
+
+ if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(MINOR(bdev->bd_dev));
+
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
@@ -828,16 +835,9 @@ static void init_once(void *data)
static void bdev_evict_inode(struct inode *inode)
{
- struct block_device *bdev = &BDEV_I(inode)->bdev;
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- /* Detach inode from wb early as bdi_put() may free bdi->wb */
- inode_detach_wb(inode);
- if (bdev->bd_bdi != &noop_backing_dev_info) {
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
- }
}
static const struct super_operations bdev_sops = {
@@ -904,9 +904,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
bdev->bd_disk = disk;
bdev->bd_partno = partno;
bdev->bd_inode = inode;
-#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
@@ -923,31 +920,6 @@ void bdev_add(struct block_device *bdev, dev_t dev)
insert_inode_hash(bdev->bd_inode);
}
-static struct block_device *bdget(dev_t dev)
-{
- struct inode *inode;
-
- inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
- return &BDEV_I(inode)->bdev;
-}
-
-/**
- * bdgrab -- Grab a reference to an already referenced block device
- * @bdev: Block device to grab a reference to.
- *
- * Returns the block_device with an additional reference when successful,
- * or NULL if the inode is already beeing freed.
- */
-struct block_device *bdgrab(struct block_device *bdev)
-{
- if (!igrab(bdev->bd_inode))
- return NULL;
- return bdev;
-}
-EXPORT_SYMBOL(bdgrab);
-
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -961,12 +933,6 @@ long nr_blockdev_pages(void)
return ret;
}
-void bdput(struct block_device *bdev)
-{
- iput(bdev->bd_inode);
-}
-EXPORT_SYMBOL(bdput);
-
/**
* bd_may_claim - test whether a block device can be claimed
* @bdev: block device of interest
@@ -1096,148 +1062,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder)
}
EXPORT_SYMBOL(bd_abort_claiming);
-#ifdef CONFIG_SYSFS
-struct bd_holder_disk {
- struct list_head list;
- struct gendisk *disk;
- int refcnt;
-};
-
-static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
- struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
-
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
- if (holder->disk == disk)
- return holder;
- return NULL;
-}
-
-static int add_symlink(struct kobject *from, struct kobject *to)
-{
- return sysfs_create_link(from, to, kobject_name(to));
-}
-
-static void del_symlink(struct kobject *from, struct kobject *to)
-{
- sysfs_remove_link(from, kobject_name(to));
-}
-
-/**
- * bd_link_disk_holder - create symlinks between holding disk and slave bdev
- * @bdev: the claimed slave bdev
- * @disk: the holding disk
- *
- * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- *
- * This functions creates the following sysfs symlinks.
- *
- * - from "slaves" directory of the holder @disk to the claimed @bdev
- * - from "holders" directory of the @bdev to the holder @disk
- *
- * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
- * passed to bd_link_disk_holder(), then:
- *
- * /sys/block/dm-0/slaves/sda --> /sys/block/sda
- * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- *
- * The caller must have claimed @bdev before calling this function and
- * ensure that both @bdev and @disk are valid during the creation and
- * lifetime of these symlinks.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
- int ret = 0;
-
- mutex_lock(&bdev->bd_disk->open_mutex);
-
- WARN_ON_ONCE(!bdev->bd_holder);
-
- /* FIXME: remove the following once add_disk() handles errors */
- if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir))
- goto out_unlock;
-
- holder = bd_find_holder_disk(bdev, disk);
- if (holder) {
- holder->refcnt++;
- goto out_unlock;
- }
-
- holder = kzalloc(sizeof(*holder), GFP_KERNEL);
- if (!holder) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->refcnt = 1;
-
- ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
- if (ret)
- goto out_free;
-
- ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- if (ret)
- goto out_del;
- /*
- * bdev could be deleted beneath us which would implicitly destroy
- * the holder directory. Hold on to it.
- */
- kobject_get(bdev->bd_holder_dir);
-
- list_add(&holder->list, &bdev->bd_holder_disks);
- goto out_unlock;
-
-out_del:
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
-out_free:
- kfree(holder);
-out_unlock:
- mutex_unlock(&bdev->bd_disk->open_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bd_link_disk_holder);
-
-/**
- * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
- * @bdev: the calimed slave bdev
- * @disk: the holding disk
- *
- * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- *
- * CONTEXT:
- * Might sleep.
- */
-void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
-
- mutex_lock(&bdev->bd_disk->open_mutex);
-
- holder = bd_find_holder_disk(bdev, disk);
-
- if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- kobject_put(bdev->bd_holder_dir);
- list_del_init(&holder->list);
- kfree(holder);
- }
-
- mutex_unlock(&bdev->bd_disk->open_mutex);
-}
-EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
-#endif
-
static void blkdev_flush_mapping(struct block_device *bdev)
{
WARN_ON_ONCE(bdev->bd_holders);
@@ -1262,11 +1086,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
}
}
- if (!bdev->bd_openers) {
+ if (!bdev->bd_openers)
set_init_blocksize(bdev);
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
- }
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
@@ -1284,16 +1105,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
static int blkdev_get_part(struct block_device *part, fmode_t mode)
{
struct gendisk *disk = part->bd_disk;
- struct block_device *whole;
int ret;
if (part->bd_openers)
goto done;
- whole = bdgrab(disk->part0);
- ret = blkdev_get_whole(whole, mode);
+ ret = blkdev_get_whole(bdev_whole(part), mode);
if (ret)
- goto out_put_whole;
+ return ret;
ret = -ENXIO;
if (!bdev_nr_sectors(part))
@@ -1301,16 +1120,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode)
disk->open_partitions++;
set_init_blocksize(part);
- if (part->bd_bdi == &noop_backing_dev_info)
- part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
done:
part->bd_openers++;
return 0;
out_blkdev_put:
- blkdev_put_whole(whole, mode);
-out_put_whole:
- bdput(whole);
+ blkdev_put_whole(bdev_whole(part), mode);
return ret;
}
@@ -1323,42 +1138,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode)
blkdev_flush_mapping(part);
whole->bd_disk->open_partitions--;
blkdev_put_whole(whole, mode);
- bdput(whole);
}
struct block_device *blkdev_get_no_open(dev_t dev)
{
struct block_device *bdev;
- struct gendisk *disk;
+ struct inode *inode;
- bdev = bdget(dev);
- if (!bdev) {
+ inode = ilookup(blockdev_superblock, dev);
+ if (!inode) {
blk_request_module(dev);
- bdev = bdget(dev);
- if (!bdev)
+ inode = ilookup(blockdev_superblock, dev);
+ if (!inode)
return NULL;
}
- disk = bdev->bd_disk;
- if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
- goto bdput;
- if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
- goto put_disk;
- if (!try_module_get(bdev->bd_disk->fops->owner))
- goto put_disk;
+ /* switch from the inode reference to a device mode one: */
+ bdev = &BDEV_I(inode)->bdev;
+ if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+ bdev = NULL;
+ iput(inode);
+
+ if (!bdev)
+ return NULL;
+ if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
+ !try_module_get(bdev->bd_disk->fops->owner)) {
+ put_device(&bdev->bd_device);
+ return NULL;
+ }
+
return bdev;
-put_disk:
- put_disk(disk);
-bdput:
- bdput(bdev);
- return NULL;
}
void blkdev_put_no_open(struct block_device *bdev)
{
module_put(bdev->bd_disk->fops->owner);
- put_disk(bdev->bd_disk);
- bdput(bdev);
+ put_device(&bdev->bd_device);
}
/**
@@ -1411,7 +1226,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
mutex_lock(&disk->open_mutex);
ret = -ENXIO;
- if (!(disk->flags & GENHD_FL_UP))
+ if (!disk_live(disk))
goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06f9f167222b..bd5689fa290e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -629,7 +629,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a1e2813731d1..7e7a897ae0d3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
} else {
struct address_space *mapping = inode->i_mapping;
- struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
+ struct page *page;
+
+ filemap_invalidate_lock_shared(mapping);
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out_inline;
@@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
+ filemap_invalidate_unlock_shared(mapping);
dout("filemap_fault %p %llu read inline data ret %x\n",
inode, off, ret);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a2900903f8c..39db97f149b9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1743,7 +1743,11 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
- return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ struct ceph_cap_flush *cf;
+
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ cf->is_capsnap = false;
+ return cf;
}
void ceph_free_cap_flush(struct ceph_cap_flush *cf)
@@ -1778,7 +1782,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
prev->wake = true;
wake = false;
}
- list_del(&cf->g_list);
+ list_del_init(&cf->g_list);
return wake;
}
@@ -1793,7 +1797,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
prev->wake = true;
wake = false;
}
- list_del(&cf->i_list);
+ list_del_init(&cf->i_list);
return wake;
}
@@ -2352,7 +2356,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
- if (!cf->caps) {
+ if (cf->is_capsnap) {
last_snap_flush = cf->tid;
break;
}
@@ -2371,7 +2375,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
first_tid = cf->tid + 1;
- if (cf->caps) {
+ if (!cf->is_capsnap) {
struct cap_msg_args arg;
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
@@ -3516,7 +3520,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
cleaned = cf->caps;
/* Is this a capsnap? */
- if (cf->caps == 0)
+ if (cf->is_capsnap)
continue;
if (cf->tid <= flush_tid) {
@@ -3589,8 +3593,9 @@ out:
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
}
if (wake_ci)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d1755ac1d964..e1d605a02d4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
+ filemap_invalidate_lock(inode->i_mapping);
ceph_zero_pagecache_range(inode, offset, length);
ret = ceph_zero_objects(inode, offset, length);
@@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (dirty)
__mark_inode_dirty(inode, dirty);
}
+ filemap_invalidate_unlock(inode->i_mapping);
ceph_put_cap_refs(ci, got);
unlock:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index fa8a847743d0..bdeb271f47d9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -240,9 +240,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- /* No mandatory locks */
- if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index afdc20213876..0b69aec23e5c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1616,7 +1616,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&mdsc->cap_dirty_lock);
list_for_each_entry(cf, &to_remove, i_list)
- list_del(&cf->g_list);
+ list_del_init(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1668,8 +1668,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
}
wake_up_all(&ci->i_cap_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index abd9af7727ad..3c444b9cb17b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -394,9 +394,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->possible_max_rank; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
kfree(m->m_data_pg_pools);
kfree(m);
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4c6bd1042c94..15105f9da3fd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -487,6 +487,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
return;
}
+ capsnap->cap_flush.is_capsnap = true;
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9215a2f4535c..b1a363641beb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -182,8 +182,9 @@ struct ceph_cap {
struct ceph_cap_flush {
u64 tid;
- int caps; /* 0 means capsnap */
+ int caps;
bool wake; /* wake up flush waiters when finish ? */
+ bool is_capsnap; /* true means capsnap */
struct list_head g_list; // global
struct list_head i_list; // per inode
};
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 2dfd0d8297eb..ddc0e8f97872 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+ filemap_invalidate_lock(inode->i_mapping);
/*
* We implement the punch hole through ioctl, so we need remove the page
* caches first, otherwise the data may be inconsistent with the server.
@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
free_xid(xid);
+ filemap_invalidate_unlock(inode->i_mapping);
return rc;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..3627dd7d25db 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
#include <linux/idr.h>
#include <linux/uio.h>
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
@@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* Deadlock or stack overflow issues can happen if we recurse here
* through waitqueue wakeup handlers. If the caller users potentially
* nested waitqueues with custom wakeup handlers, then it should
- * check eventfd_signal_count() before calling this function. If
- * it returns true, the eventfd_signal() call should be deferred to a
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(current->in_eventfd_signal))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ current->in_eventfd_signal = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 54eec9185627..1248ff4ef562 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
tristate "Second extended fs support"
+ select FS_IOMAP
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e512630cb63e..3be9dd6412b7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -667,9 +667,6 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
-#ifdef CONFIG_FS_DAX
- struct rw_semaphore dax_sem;
-#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -685,14 +682,6 @@ struct ext2_inode_info {
#endif
};
-#ifdef CONFIG_FS_DAX
-#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
-#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
-#else
-#define dax_sem_down_write(ext2_inode)
-#define dax_sem_up_write(ext2_inode)
-#endif
-
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index f98466acc672..eb97aa3d700e 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,7 +81,7 @@ out_unlock:
*
* mmap_lock (MM)
* sb_start_pagefault (vfs, freeze)
- * ext2_inode_info->dax_sem
+ * address_space->invalidate_lock
* address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
* ext2_inode_info->truncate_mutex
*
@@ -91,7 +91,6 @@ out_unlock:
static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
vm_fault_t ret;
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
@@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
}
- down_read(&ei->dax_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
- up_read(&ei->dax_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index dadb121beb22..333fa62661d5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,7 +799,6 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
}
-#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -852,16 +851,18 @@ const struct iomap_ops ext2_iomap_ops = {
.iomap_begin = ext2_iomap_begin,
.iomap_end = ext2_iomap_end,
};
-#else
-/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-const struct iomap_ops ext2_iomap_ops;
-#endif /* CONFIG_FS_DAX */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext2_get_block);
+ int ret;
+
+ inode_lock(inode);
+ len = min_t(u64, len, i_size_read(inode));
+ ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
+ inode_unlock(inode);
+
+ return ret;
}
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
@@ -1177,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
-/* dax_sem must be held when calling this function */
+/* mapping->invalidate_lock must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1194,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
#ifdef CONFIG_FS_DAX
- WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+ WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
#endif
n = ext2_block_to_path(inode, iblock, offsets, NULL);
@@ -1276,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
if (ext2_inode_is_fast_symlink(inode))
return;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
__ext2_truncate_blocks(inode, offset);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1308,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
inode->i_mtime = inode->i_ctime = current_time(inode);
if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 21e09fbaa46f..987bcf32ed46 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -206,9 +206,6 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
-#ifdef CONFIG_FS_DAX
- init_rwsem(&ei->dax_sem);
-#endif
inode_init_once(&ei->vfs_inode);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c51e243450d..7ebaf66b6e31 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1086,15 +1086,6 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
- /*
- * i_mmap_sem is for serializing page faults with truncate / punch hole
- * operations. We have to make sure that new page cannot be faulted in
- * a section of the inode that is being punched. We cannot easily use
- * i_data_sem for this since we need protection for the whole punch
- * operation and i_data_sem ranks below transaction start so we have
- * to occasionally drop it.
- */
- struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92ad64b89d9b..c33e0a2cb6c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
unsigned int max_blocks;
loff_t new_size = 0;
@@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
/* Now release the pages and zero block aligned part of pages */
@@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
goto out_mutex;
}
@@ -5221,6 +5222,7 @@ out:
static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
handle_t *handle;
unsigned int credits;
@@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ ret = filemap_write_and_wait_range(mapping, ioffset, offset);
if (ret)
goto out_mmap;
/*
* Write data that will be shifted to preserve them when discarding
* page cache below. We are also protected from pages becoming dirty
- * by i_mmap_sem.
+ * by i_rwsem and invalidate_lock.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ ret = filemap_write_and_wait_range(mapping, offset + len,
LLONG_MAX);
if (ret)
goto out_mmap;
@@ -5350,7 +5352,7 @@ out_stop:
ext4_journal_stop(handle);
ext4_fc_stop_ineligible(sb);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5367,6 +5369,7 @@ out_mutex:
static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
@@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5526,7 +5529,7 @@ out_stop:
ext4_journal_stop(handle);
ext4_fc_stop_ineligible(sb);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 816dedcbd541..d3b4ed91aa68 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
@@ -731,10 +732,10 @@ retry:
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
}
return result;
@@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = ext4_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8de607849df..325c038e7b23 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
{
- up_write(&ei->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&ei->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
int ext4_break_layouts(struct inode *inode)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
struct page *page;
int error;
- if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+ if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
do {
@@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode)
error = ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1,
TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(ei));
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
@@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
if (ext4_has_inline_data(inode)) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_convert_inline_data(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
return ret;
}
@@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
out_stop:
ext4_journal_stop(handle);
out_dio:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode_dio_wait(inode);
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
rc = ext4_break_layouts(inode);
if (rc) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
goto err_out;
}
@@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
error = rc;
}
out_mmap_sem:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
}
if (!error) {
@@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* data (and journalled aops don't know how to handle these cases).
*/
if (val) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err < 0) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return err;
}
}
@@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
@@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
err = ext4_convert_inline_data(inode);
if (err)
@@ -6176,7 +6175,7 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(err);
out:
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
out_error:
@@ -6184,15 +6183,3 @@ out_error:
ext4_journal_stop(handle);
goto out;
}
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
-
- return ret;
-}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eed6170aded..4fb5fe083c2b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
goto journal_err_out;
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto err_out;
@@ -256,7 +256,7 @@ err_out1:
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dfa09a277b56..d6df62fc810c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
/*
* Lock ordering
*
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
* page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- * page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ * -> page lock -> i_data_sem (rw)
*
* buffered write path:
* sb_start_write -> i_mutex -> mmap_lock
@@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_data_sem (rw)
*
* truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ * page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
* i_data_sem (rw)
*
* direct IO:
@@ -1360,7 +1358,6 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
- init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
ext4_fc_init_inode(&ei->vfs_inode);
}
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index bcbe3668c1d4..ce84aa2786c7 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ struct address_space *mapping = inode->i_mapping;
+
/*
* We don't need to call ext4_break_layouts() because the blocks we
* are truncating were never visible to userspace.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ filemap_invalidate_lock(mapping);
+ truncate_inode_pages(mapping, inode->i_size);
ext4_truncate(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
}
/*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d2cf48c5a2e4..eb222b35edef 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
int ret = 0;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
@@ -3894,7 +3894,7 @@ done:
clear_inode_flag(inode, FI_DO_DEFRAG);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ee8eb33e2c25..906b2c4b50e7 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -754,7 +754,6 @@ struct f2fs_inode_info {
/* avoid racing between foreground op and gc */
struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_mmap_sem;
struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6afd4562335f..1ff333755721 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -38,10 +38,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
vm_fault_t ret;
- down_read(&F2FS_I(inode)->i_mmap_sem);
ret = filemap_fault(vmf);
- up_read(&F2FS_I(inode)->i_mmap_sem);
-
if (!ret)
f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
F2FS_BLKSIZE);
@@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
file_update_time(vmf->vma->vm_file);
- down_read(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
@@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
trace_f2fs_vm_page_mkwrite(page, DATA);
out_sem:
- up_read(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
err:
@@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_end = (loff_t)pg_end << PAGE_SHIFT;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
ret = f2fs_truncate_hole(inode, pg_start, pg_end);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
@@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
/* write out all moved pages, if possible */
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
new_size = i_size_read(inode) - len;
ret = f2fs_truncate_blocks(inode, new_size, true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (!ret)
f2fs_i_size_write(inode, new_size);
return ret;
@@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
pgoff_t end;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
(loff_t)index << PAGE_SHIFT,
@@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1543,6 +1540,7 @@ out:
static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
pgoff_t nr, pg_start, pg_end, delta, idx;
loff_t new_size;
int ret = 0;
@@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
return ret;
/* write out all dirty pages from offset */
- ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
if (ret)
return ret;
@@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx + delta, nr, false);
f2fs_unlock_op(sbi);
}
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
- down_write(&F2FS_I(inode)->i_mmap_sem);
- filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ filemap_invalidate_lock(mapping);
+ filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
goto out;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
out:
inode_unlock(inode);
@@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
goto err;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
to_end ? LLONG_MAX : end_addr - 1);
@@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
prev_block, len, range.flags);
out:
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
@@ -4313,9 +4311,9 @@ write:
/* if we couldn't write data, we should deallocate blocks. */
if (preallocated && i_size_read(inode) < target_size) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
f2fs_truncate(inode);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8fecd3050ccd..ce2ab1b85c11 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
mutex_init(&fi->inmem_lock);
init_rwsem(&fi->i_gc_rwsem[READ]);
init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_mmap_sem);
init_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 6642246206bd..daad532a4e2b 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -378,7 +378,7 @@ out:
ret = kstrtol(name, 10, &data);
if (ret)
return ret;
- if (data >= IOPRIO_BE_NR || data < 0)
+ if (data >= IOPRIO_NR_LEVELS || data < 0)
return -EINVAL;
cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 860e884e56e8..978ac6751aeb 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -5,6 +5,7 @@
#include <linux/blkdev.h>
#include <linux/sched/signal.h>
+#include <linux/backing-dev-defs.h>
#include "fat.h"
struct fatent_operations {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f946bec8f1f1..68added37c15 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -150,7 +150,8 @@ void f_delown(struct file *filp)
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
- read_lock(&filp->f_owner.lock);
+
+ read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
pid = pid_vnr(filp->f_owner.pid);
@@ -158,7 +159,7 @@ pid_t f_getown(struct file *filp)
pid = -pid;
}
rcu_read_unlock();
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
return pid;
}
@@ -208,7 +209,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
struct f_owner_ex owner = {};
int ret = 0;
- read_lock(&filp->f_owner.lock);
+ read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
owner.pid = pid_vnr(filp->f_owner.pid);
@@ -231,7 +232,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
ret = -EINVAL;
break;
}
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@@ -249,10 +250,10 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
uid_t src[2];
int err;
- read_lock(&filp->f_owner.lock);
+ read_lock_irq(&filp->f_owner.lock);
src[0] = from_kuid(user_ns, filp->f_owner.uid);
src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
@@ -1003,13 +1004,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
+ unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- read_lock(&fa->fa_lock);
+ read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -1018,7 +1020,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- read_unlock(&fa->fa_lock);
+ read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
}
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 9d58371d22c2..281d79f8b3d3 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
/*
* Can't do inline reclaim in fault path. We call
* dax_layout_busy_page() before we free a range. And
- * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
- * In fault path we enter with fi->i_mmap_sem held and can't drop
- * it. Also in fault path we hold fi->i_mmap_sem shared and not
- * exclusive, so that creates further issues with fuse_wait_dax_page().
- * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
- * range to become free and retry.
+ * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+ * In fault path we enter with mapping->invalidate_lock held and can't
+ * drop it. Also in fault path we hold mapping->invalidate_lock shared
+ * and not exclusive, so that creates further issues with
+ * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+ * will wait for a memory range to become free and retry.
*/
if (flags & IOMAP_FAULT) {
alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
down_write(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
- /* We are holding either inode lock or i_mmap_sem, and that should
+ /* We are holding either inode lock or invalidate_lock, and that should
* ensure that dmap can't be truncated. We are holding a reference
* on dmap and that should make sure it can't be reclaimed. So dmap
* should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {
static void fuse_wait_dax_page(struct inode *inode)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
-/* Should be called with fi->i_mmap_sem lock held exclusively */
+/* Should be called with mapping->invalidate_lock held exclusively */
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
loff_t start, loff_t end)
{
@@ -813,18 +811,18 @@ retry:
* we do not want any read/write/mmap to make progress and try
* to populate page cache or access memory we are trying to free.
*/
- down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
goto retry;
}
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
int ret;
struct interval_tree_node *node;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Lookup a dmap and corresponding file offset to reclaim. */
down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
out_write_dmap_sem:
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return dmap;
}
@@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
* had a reference or some other temporary failure,
* Try again. We want to give up inline reclaim only
* if there is no range assigned to this node. Otherwise
- * if a deadlock is possible if we sleep with fi->i_mmap_sem
- * held and worker to free memory can't make progress due
- * to unavailability of fi->i_mmap_sem lock. So sleep
- * only if fi->dax->nr=0
+ * a deadlock is possible if we sleep with
+ * mapping->invalidate_lock held while the worker that frees memory
+ * can't make progress because mapping->invalidate_lock is
+ * unavailable. So sleep only if fi->dax->nr=0
*/
if (retry)
continue;
@@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
* There are no mappings which can be reclaimed. Wait for one.
* We are not holding fi->dax->sem. So it is possible
* that range gets added now. But as we are not holding
- * fi->i_mmap_sem, worker should still be able to free up
- * a range and wake us up.
+ * mapping->invalidate_lock, worker should still be able to
+ * free up a range and wake us up.
*/
if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
/*
* Free a range of memory.
* Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
* 2. Take fi->dax->sem to protect interval tree and also to make sure
* read/write can not reuse a dmap which we might be freeing.
*/
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
if (ret) {
pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
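Taken together, the dax.c hunks replace the fuse-private i_mmap_sem with the generic mapping->invalidate_lock: reclaim, truncation and layout-breaking paths take it exclusively, while the fault path takes it shared. A schematic pairing of the new helpers is sketched below; demo_truncate_range() and demo_fault() are illustrative names, while the filemap_invalidate_* helpers are the ones used in the hunks above.

/* Schematic use of the invalidate_lock helpers; function names are made up. */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>

/* Invalidation side: hold the lock exclusively while pages go away so no
 * new page faults can repopulate the range mid-operation. */
static void demo_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;

	filemap_invalidate_lock(mapping);
	truncate_inode_pages_range(mapping, start, end);
	filemap_invalidate_unlock(mapping);
}

/* Fault side: shared mode, so faults can run concurrently with each other
 * but are excluded while an invalidation holds the lock exclusively. */
static vm_fault_t demo_fault(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	vm_fault_t ret;

	filemap_invalidate_lock_shared(mapping);
	ret = filemap_fault(vmf);
	filemap_invalidate_unlock_shared(mapping);
	return ret;
}
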
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index eade6f965b2e..d9b977c0f38d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct address_space *mapping = inode->i_mapping;
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
if (FUSE_IS_DAX(inode) && is_truncate) {
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
fault_blocked = true;
err = fuse_dax_break_layouts(inode, 0, 0);
if (err) {
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return err;
}
}
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, outarg.attr.size);
- invalidate_inode_pages2(inode->i_mapping);
+ invalidate_inode_pages2(mapping);
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
out:
if (fault_blocked)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return 0;
@@ -1711,7 +1712,7 @@ error:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (fault_blocked)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 97f860cfc195..621a662c19fb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
}
if (dax_truncate) {
- down_write(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
out:
if (dax_truncate)
- up_write(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (lock_inode) {
inode_lock(inode);
if (block_faults) {
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
@@ -2976,7 +2976,7 @@ out:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (block_faults)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (lock_inode)
inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
* modifications. Yet this does give less guarantees than if the
* copying was performed with write(2).
*
- * To fix this a i_mmap_sem style lock could be used to prevent new
+ * To fix this a mapping->invalidate_lock could be used to prevent new
* faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 07829ce78695..6fb639b97ea8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -149,13 +149,6 @@ struct fuse_inode {
/** Lock to protect write related fields */
spinlock_t lock;
- /**
- * Can't take inode lock in fault path (leads to circular dependency).
- * Introduce another semaphore which can be taken in fault path and
- * then other filesystem paths can take this to block faults.
- */
- struct rw_semaphore i_mmap_sem;
-
#ifdef CONFIG_FUSE_DAX
/*
* Dax specific inode data
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b9beb39a4a18..e07e429f32e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
- init_rwsem(&fi->i_mmap_sem);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 84ec053d43b4..c559827cb6f9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1237,9 +1237,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if (cmd == F_CANCELLK) {
/* Hack: */
cmd = F_SETLK;
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 2b36dc6f0a10..ec975f466877 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ select FS_IOMAP
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c3a49aacf20a..fb37f57130aa 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -9,6 +9,7 @@
#include "hpfs_fn.h"
#include <linux/mpage.h>
+#include <linux/iomap.h>
#include <linux/fiemap.h>
#define BLOCKS(size) (((size) + 511) >> 9)
@@ -116,6 +117,47 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
return r;
}
+static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int n_secs;
+ secno s;
+
+ if (WARN_ON_ONCE(flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ return -EINVAL;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = offset;
+
+ hpfs_lock(sb);
+ s = hpfs_bmap(inode, offset >> blkbits, &n_secs);
+ if (s) {
+ n_secs = hpfs_search_hotfix_map_for_range(sb, s,
+ min_t(loff_t, n_secs, length));
+ if (unlikely(!n_secs)) {
+ s = hpfs_search_hotfix_map(sb, s);
+ n_secs = 1;
+ }
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_MERGED;
+ iomap->addr = (u64)s << blkbits;
+ iomap->length = (u64)n_secs << blkbits;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = 1 << blkbits;
+ }
+
+ hpfs_unlock(sb);
+ return 0;
+}
+
+static const struct iomap_ops hpfs_iomap_ops = {
+ .iomap_begin = hpfs_iomap_begin,
+};
+
static int hpfs_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, hpfs_get_block);
@@ -192,7 +234,14 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block);
+ int ret;
+
+ inode_lock(inode);
+ len = min_t(u64, len, i_size_read(inode));
+ ret = iomap_fiemap(inode, fieinfo, start, len, &hpfs_iomap_ops);
+ inode_unlock(inode);
+
+ return ret;
}
const struct address_space_operations hpfs_aops = {
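hpfs_fiemap() now reports extents through iomap_fiemap() and the read-only hpfs_iomap_ops above instead of generic_block_fiemap(). The result is still consumed through the ordinary FS_IOC_FIEMAP ioctl; a minimal, filesystem-agnostic caller is sketched below to show what the new path produces (it is not hpfs-specific and uses only the stock UAPI).

/* Minimal FIEMAP caller: prints the first few extents of a file. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 8 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_extent_count = 8;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	close(fd);
	free(fm);
	return 0;
}
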
diff --git a/fs/inode.c b/fs/inode.c
index c93500d84264..84c528cd1955 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
+ __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock",
+ &sb->s_type->invalidate_lock_key);
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
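The inode.c hunk seeds mapping->invalidate_lock with a lock_class_key stored in the inode's file_system_type, so lockdep gives each filesystem type its own class for the new lock instead of collapsing every filesystem into one (the lock nests differently against other locks from one filesystem to the next). The same idiom in isolation, with made-up names:

/* Illustrative only: giving an rwsem an explicit lockdep class. */
#include <linux/rwsem.h>

struct demo_object {
	struct rw_semaphore lock;
};

/* One key per object type => one lockdep class shared by all instances. */
static struct lock_class_key demo_lock_key;

static void demo_object_init(struct demo_object *obj)
{
	/*
	 * Like init_rwsem(), but the class key is passed explicitly rather
	 * than being the per-call-site static that the init_rwsem() macro
	 * would create, so the key can live in a shared structure such as
	 * file_system_type.
	 */
	__init_rwsem(&obj->lock, "demo_object.lock", &demo_lock_key);
}
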
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 4ce83bb48021..cd9bd095fb1b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -178,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker)
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
@@ -188,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker)
worker->flags = 0;
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -254,18 +254,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (!ret) {
bool do_create = false, first = false;
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
if (!acct->nr_workers)
first = true;
acct->nr_workers++;
do_create = true;
}
- raw_spin_unlock_irq(&wqe->lock);
- if (do_create)
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
create_io_worker(wqe->wq, wqe, acct->index, first);
+ }
}
}
@@ -288,14 +289,14 @@ static void create_worker_cb(struct callback_head *cb)
wqe = worker->wqe;
wq = wqe->wq;
acct = &wqe->acct[worker->create_index];
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
if (!acct->nr_workers)
first = true;
acct->nr_workers++;
do_create = true;
}
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (do_create) {
create_io_worker(wq, wqe, worker->create_index, first);
} else {
@@ -423,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
spin_unlock(&wq->hash->wait.lock);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
+/*
+ * We can always run the work if the worker is currently the same type as
+ * the work (e.g. both are bound, or both are unbound). If they are not the
+ * same, only allow the work if the target class's worker count is still
+ * below its limit.
+ */
+static bool io_worker_can_run_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ struct io_wqe_acct *acct;
+
+ if (!(worker->flags & IO_WORKER_F_BOUND) !=
+ !(work->flags & IO_WQ_WORK_UNBOUND))
+ return true;
+
+ /* not the same type, check if we'd go over the limit */
+ acct = io_work_get_acct(worker->wqe, work);
+ return acct->nr_workers < acct->max_workers;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+ struct io_worker *worker,
+ bool *stalled)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
@@ -435,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
work = container_of(node, struct io_wq_work, list);
+ if (!io_worker_can_run_work(worker, work))
+ break;
+
/* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
@@ -461,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
raw_spin_unlock(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
+ *stalled = true;
}
return NULL;
@@ -484,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
- spin_lock_irq(&worker->lock);
+ spin_lock(&worker->lock);
worker->cur_work = work;
- spin_unlock_irq(&worker->lock);
+ spin_unlock(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
@@ -500,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
+ bool stalled;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -508,13 +535,14 @@ get_next:
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe);
+ stalled = false;
+ work = io_get_next_work(wqe, worker, &stalled);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (!wq_list_empty(&wqe->work_list))
+ else if (stalled)
wqe->flags |= IO_WQE_FLAG_STALLED;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
@@ -546,16 +574,16 @@ get_next:
clear_bit(hash, &wq->hash->map);
if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
}
} while (work);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -576,13 +604,13 @@ static int io_wqe_worker(void *data)
set_current_state(TASK_INTERRUPTIBLE);
loop:
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
io_worker_handle_work(worker);
goto loop;
}
__io_worker_idle(wqe, worker);
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
@@ -601,7 +629,7 @@ loop:
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
io_worker_handle_work(worker);
}
@@ -643,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- raw_spin_lock_irq(&worker->wqe->lock);
+ raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock_irq(&worker->wqe->lock);
+ raw_spin_unlock(&worker->wqe->lock);
}
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
@@ -671,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo
kfree(worker);
fail:
atomic_dec(&acct->nr_running);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
acct->nr_workers--;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wq);
return;
}
@@ -683,7 +711,7 @@ fail:
set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY;
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
@@ -691,7 +719,7 @@ fail:
worker->flags |= IO_WORKER_F_BOUND;
if (first && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
wake_up_new_task(tsk);
}
@@ -766,8 +794,7 @@ append:
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- int work_flags;
- unsigned long flags;
+ bool do_wake;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -779,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- work_flags = work->flags;
- raw_spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running);
+ raw_spin_unlock(&wqe->lock);
- if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))
+ if (do_wake)
io_wqe_wake_worker(wqe, acct);
}
@@ -812,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
- unsigned long flags;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
* may dereference the passed in work.
*/
- spin_lock_irqsave(&worker->lock, flags);
+ spin_lock(&worker->lock);
if (worker->cur_work &&
match->fn(worker->cur_work, match->data)) {
set_notify_signal(worker->task);
match->nr_running++;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ spin_unlock(&worker->lock);
return match->nr_running && !match->cancel_all;
}
@@ -852,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
- unsigned long flags;
retry:
- raw_spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock(&wqe->lock);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
@@ -870,7 +895,7 @@ retry:
/* not safe to continue after unlock */
goto retry;
}
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock(&wqe->lock);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1151,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
return 0;
}
+/*
+ * Set the max number of io-wq workers for each class (bounded and unbounded);
+ * the previous maximums are passed back through new_count. A zero entry
+ * leaves that class unchanged and only reports the old value.
+ */
+int io_wq_max_workers(struct io_wq *wq, int *new_count)
+{
+ int i, node, prev = 0;
+
+ for (i = 0; i < 2; i++) {
+ if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
+ new_count[i] = task_rlimit(current, RLIMIT_NPROC);
+ }
+
+ rcu_read_lock();
+ for_each_node(node) {
+ struct io_wqe_acct *acct;
+
+ for (i = 0; i < 2; i++) {
+ acct = &wq->wqes[node]->acct[i];
+ prev = max_t(int, acct->max_workers, prev);
+ if (new_count[i])
+ acct->max_workers = new_count[i];
+ new_count[i] = prev;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
static __init int io_wq_init(void)
{
int ret;
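io_wq_max_workers() above is the io-wq side of a new io_uring_register() operation: the caller passes a two-entry array (bounded, unbounded), values are clamped to RLIMIT_NPROC, a zero entry changes nothing, and the previous maximums come back in the same array. A hedged userspace sketch follows, assuming the liburing wrapper io_uring_register_iowq_max_workers() for that opcode; if your liburing predates it, the raw io_uring_register(2) call would be needed instead.

/* Sketch; assumes liburing provides io_uring_register_iowq_max_workers(). */
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	/* [0] = bounded workers, [1] = unbounded workers; 0 = leave as-is */
	unsigned int vals[2] = { 0, 0 };

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* All-zero call: nothing changes, current limits are reported back. */
	if (io_uring_register_iowq_max_workers(&ring, vals) == 0)
		printf("current: bounded=%u unbounded=%u\n", vals[0], vals[1]);

	/* Cap unbounded workers at 4, leave bounded workers untouched. */
	vals[0] = 0;
	vals[1] = 4;
	io_uring_register_iowq_max_workers(&ring, vals);

	io_uring_queue_exit(&ring);
	return 0;
}
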
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 308af3928424..bf5c4c533760 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 504aede8ca47..73928d957691 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -92,12 +92,12 @@
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
-/* 512 entries per page on 64-bit archs, 64 pages max */
+/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 15)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
-#define IO_RSRC_TAG_TABLE_SHIFT 9
+#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
@@ -375,6 +375,7 @@ struct io_ring_ctx {
struct io_submit_state submit_state;
struct list_head timeout_list;
+ struct list_head ltimeout_list;
struct list_head cq_overflow_list;
struct xarray io_buffers;
struct xarray personalities;
@@ -508,6 +509,7 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
+ u32 flags;
};
struct io_accept {
@@ -515,6 +517,7 @@ struct io_accept {
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
+ u32 file_slot;
unsigned long nofile;
};
@@ -549,6 +552,7 @@ struct io_timeout_rem {
/* timeout update */
struct timespec64 ts;
u32 flags;
+ bool ltimeout;
};
struct io_rw {
@@ -580,6 +584,7 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
+ u32 file_slot;
struct filename *filename;
struct open_how how;
unsigned long nofile;
@@ -705,12 +710,12 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
- REQ_F_LTIMEOUT_ACTIVE_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
+ REQ_F_ARM_LTIMEOUT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_NOWAIT_READ_BIT,
REQ_F_NOWAIT_WRITE_BIT,
@@ -750,8 +755,6 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
- /* linked timeout is active, i.e. prepared by link's head */
- REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -768,6 +771,8 @@ enum {
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
/* skip refcounting if not set */
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
+ /* there is a linked timeout that has to be armed */
+ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
};
struct async_poll {
@@ -775,7 +780,7 @@ struct async_poll {
struct io_poll_iocb *double_poll;
};
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req);
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
union {
@@ -1034,6 +1039,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_UNLINKAT] = {},
};
+/* requests with any of those set should undergo io_disarm_next() */
+#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
@@ -1060,6 +1068,10 @@ static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags, u32 slot_index);
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
@@ -1077,6 +1089,14 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+{
+ if (!*locked) {
+ mutex_lock(&ctx->uring_lock);
+ *locked = true;
+ }
+}
+
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
@@ -1115,14 +1135,19 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
-static inline void io_req_refcount(struct io_kiocb *req)
+static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
req->flags |= REQ_F_REFCOUNT;
- atomic_set(&req->refs, 1);
+ atomic_set(&req->refs, nr);
}
}
+static inline void io_req_set_refcount(struct io_kiocb *req)
+{
+ __io_req_set_refcount(req, 1);
+}
+
static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1167,6 +1192,12 @@ static inline void req_set_fail(struct io_kiocb *req)
req->flags |= REQ_F_FAIL;
}
+static inline void req_fail_link_node(struct io_kiocb *req, int res)
+{
+ req_set_fail(req);
+ req->result = res;
+}
+
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1185,11 +1216,19 @@ static void io_fallback_req_func(struct work_struct *work)
fallback_work.work);
struct llist_node *node = llist_del_all(&ctx->fallback_llist);
struct io_kiocb *req, *tmp;
+ bool locked = false;
percpu_ref_get(&ctx->refs);
llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
- req->io_task_work.func(req);
+ req->io_task_work.func(req, &locked);
+
+ if (locked) {
+ if (ctx->submit_state.compl_nr)
+ io_submit_flush_completions(ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
percpu_ref_put(&ctx->refs);
+
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
@@ -1241,6 +1280,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
+ INIT_LIST_HEAD(&ctx->ltimeout_list);
spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
@@ -1298,27 +1338,28 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
+static inline void io_unprep_linked_timeout(struct io_kiocb *req)
{
- struct io_kiocb *nxt = req->link;
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+}
- if (req->flags & REQ_F_LINK_TIMEOUT)
+static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
+{
+ if (WARN_ON_ONCE(!req->link))
return NULL;
- /* linked timeouts should have two refs once prep'ed */
- io_req_refcount(req);
- io_req_refcount(nxt);
- req_ref_get(nxt);
-
- nxt->timeout.head = req;
- nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
+ req->flags &= ~REQ_F_ARM_LTIMEOUT;
req->flags |= REQ_F_LINK_TIMEOUT;
- return nxt;
+
+ /* linked timeouts should have two refs once prep'ed */
+ io_req_set_refcount(req);
+ __io_req_set_refcount(req->link, 2);
+ return req->link;
}
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
- if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT))
+ if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
return NULL;
return __io_prep_linked_timeout(req);
}
@@ -1372,12 +1413,15 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-static void io_queue_async_work(struct io_kiocb *req)
+static void io_queue_async_work(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
+ /* must not take the lock, NULL it as a precaution */
+ locked = NULL;
+
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1517,6 +1561,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return !ctx->eventfd_async || io_wq_current_is_worker();
}
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no dependency on a
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and the number of CQEs posted to the CQ ring.
+ */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
/*
@@ -1614,10 +1665,32 @@ static inline void io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
+ if (likely(task == current)) {
+ tctx->cached_refs += nr;
+ } else {
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+ }
+}
+
+static void io_task_refs_refill(struct io_uring_task *tctx)
+{
+ unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+ percpu_counter_add(&tctx->inflight, refill);
+ refcount_add(refill, &current->usage);
+ tctx->cached_refs += refill;
+}
+
+static inline void io_get_task_refs(int nr)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0))
+ io_task_refs_refill(tctx);
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
@@ -1690,7 +1763,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
*/
if (req_ref_put_and_test(req)) {
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
+ if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
io_req_task_queue(req->link);
@@ -1891,16 +1964,13 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *link = req->link;
- /*
- * Can happen if a linked timeout fired and link had been like
- * req -> link t-out -> link t-out [-> ...]
- */
- if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
struct io_timeout_data *io = link->async_data;
io_remove_next_linked(req);
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ list_del(&link->timeout.list);
io_cqring_fill_event(link->ctx, link->user_data,
-ECANCELED, 0);
io_put_req_deferred(link);
@@ -1917,11 +1987,16 @@ static void io_fail_links(struct io_kiocb *req)
req->link = NULL;
while (link) {
+ long res = -ECANCELED;
+
+ if (link->flags & REQ_F_FAIL)
+ res = link->result;
+
nxt = link->link;
link->link = NULL;
trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
+ io_cqring_fill_event(link->ctx, link->user_data, res, 0);
io_put_req_deferred(link);
link = nxt;
}
@@ -1932,7 +2007,18 @@ static bool io_disarm_next(struct io_kiocb *req)
{
bool posted = false;
- if (likely(req->flags & REQ_F_LINK_TIMEOUT)) {
+ if (req->flags & REQ_F_ARM_LTIMEOUT) {
+ struct io_kiocb *link = req->link;
+
+ req->flags &= ~REQ_F_ARM_LTIMEOUT;
+ if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
+ io_remove_next_linked(req);
+ io_cqring_fill_event(link->ctx, link->user_data,
+ -ECANCELED, 0);
+ io_put_req_deferred(link);
+ posted = true;
+ }
+ } else if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
@@ -1957,7 +2043,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
+ if (req->flags & IO_DISARM_MASK) {
struct io_ring_ctx *ctx = req->ctx;
bool posted;
@@ -1981,20 +2067,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
-static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
- if (ctx->submit_state.compl_nr) {
- mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(ctx);
+ if (*locked) {
+ if (ctx->submit_state.compl_nr)
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
+ *locked = false;
}
percpu_ref_put(&ctx->refs);
}
static void tctx_task_work(struct callback_head *cb)
{
+ bool locked = false;
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
@@ -2017,18 +2105,20 @@ static void tctx_task_work(struct callback_head *cb)
io_task_work.node);
if (req->ctx != ctx) {
- ctx_flush_and_put(ctx);
+ ctx_flush_and_put(ctx, &locked);
ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ locked = mutex_trylock(&ctx->uring_lock);
percpu_ref_get(&ctx->refs);
}
- req->io_task_work.func(req);
+ req->io_task_work.func(req, &locked);
node = next;
} while (node);
cond_resched();
}
- ctx_flush_and_put(ctx);
+ ctx_flush_and_put(ctx, &locked);
}
static void io_req_task_work_add(struct io_kiocb *req)
@@ -2080,27 +2170,25 @@ static void io_req_task_work_add(struct io_kiocb *req)
}
}
-static void io_req_task_cancel(struct io_kiocb *req)
+static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- /* ctx is guaranteed to stay alive while we hold uring_lock */
- mutex_lock(&ctx->uring_lock);
+ /* not needed for normal modes, but SQPOLL depends on it */
+ io_tw_lock(ctx, locked);
io_req_complete_failed(req, req->result);
- mutex_unlock(&ctx->uring_lock);
}
-static void io_req_task_submit(struct io_kiocb *req)
+static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
- mutex_lock(&ctx->uring_lock);
+ io_tw_lock(ctx, locked);
+ /* req->task == current here, checking PF_EXITING is safe */
if (likely(!(req->task->flags & PF_EXITING)))
__io_queue_sqe(req);
else
io_req_complete_failed(req, -EFAULT);
- mutex_unlock(&ctx->uring_lock);
}
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -2136,6 +2224,11 @@ static void io_free_req(struct io_kiocb *req)
__io_free_req(req);
}
+static void io_free_req_work(struct io_kiocb *req, bool *locked)
+{
+ io_free_req(req);
+}
+
struct req_batch {
struct task_struct *task;
int task_refs;
@@ -2154,9 +2247,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
{
if (rb->ctx_refs)
percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task == current)
- current->io_uring->cached_refs += rb->task_refs;
- else if (rb->task)
+ if (rb->task)
io_put_task(rb->task, rb->task_refs);
}
@@ -2182,7 +2273,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
}
static void io_submit_flush_completions(struct io_ring_ctx *ctx)
- __must_hold(&req->ctx->uring_lock)
+ __must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
int i, nr = state->compl_nr;
@@ -2235,7 +2326,7 @@ static inline void io_put_req(struct io_kiocb *req)
static inline void io_put_req_deferred(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
- req->io_task_work.func = io_free_req;
+ req->io_task_work.func = io_free_req_work;
io_req_task_work_add(req);
}
}
@@ -2270,6 +2361,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
struct io_buffer *kbuf;
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
return io_put_kbuf(req, kbuf);
}
@@ -2289,7 +2382,7 @@ static inline bool io_run_task_work(void)
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done, bool resubmit)
+ struct list_head *done)
{
struct req_batch rb;
struct io_kiocb *req;
@@ -2299,22 +2392,18 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_init_req_batch(&rb);
while (!list_empty(done)) {
- int cflags = 0;
-
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
+ if (READ_ONCE(req->result) == -EAGAIN &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
io_req_task_queue_reissue(req);
continue;
}
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
(*nr_events)++;
if (req_ref_put_and_test(req))
@@ -2327,7 +2416,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min, bool resubmit)
+ long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
@@ -2367,7 +2456,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done, resubmit);
+ io_iopoll_complete(ctx, nr_events, &done);
return 0;
}
@@ -2385,7 +2474,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_do_iopoll(ctx, &nr_events, 0, false);
+ io_do_iopoll(ctx, &nr_events, 0);
/* let it sleep and repeat later if can't complete a request */
if (nr_events == 0)
@@ -2447,7 +2536,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min, true);
+ ret = io_do_iopoll(ctx, &nr_events, min);
} while (!ret && nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
@@ -2532,13 +2621,22 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
return false;
}
-static void io_req_task_complete(struct io_kiocb *req)
+static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- int cflags = 0;
+ unsigned int cflags = io_put_rw_kbuf(req);
+ long res = req->result;
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
- __io_req_complete(req, 0, req->result, cflags);
+ if (*locked) {
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_state *state = &ctx->submit_state;
+
+ io_req_complete_state(req, res, cflags);
+ state->compl_reqs[state->compl_nr++] = req;
+ if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
+ io_submit_flush_completions(ctx);
+ } else {
+ io_req_complete_post(req, res, cflags);
+ }
}
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
@@ -2546,7 +2644,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
{
if (__io_complete_rw_common(req, res))
return;
- io_req_task_complete(req);
+ __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -2806,12 +2904,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (io_resubmit_prep(req)) {
io_req_task_queue_reissue(req);
} else {
- int cflags = 0;
-
req_set_fail(req);
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ __io_req_complete(req, issue_flags, ret,
+ io_put_rw_kbuf(req));
}
}
}
@@ -3493,7 +3588,7 @@ static int io_renameat_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3544,7 +3639,8 @@ static int io_unlinkat_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3590,8 +3686,8 @@ static int io_shutdown_prep(struct io_kiocb *req,
#if defined(CONFIG_NET)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
- sqe->buf_index)
+ if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->shutdown.how = READ_ONCE(sqe->len);
@@ -3739,7 +3835,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -3772,7 +3869,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3822,6 +3920,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
req->open.filename = NULL;
return ret;
}
+
+ req->open.file_slot = READ_ONCE(sqe->file_index);
+ if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
+ return -EINVAL;
+
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
@@ -3859,8 +3962,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
struct file *file;
- bool nonblock_set;
- bool resolve_nonblock;
+ bool resolve_nonblock, nonblock_set;
+ bool fixed = !!req->open.file_slot;
int ret;
ret = build_open_flags(&req->open.how, &op);
@@ -3879,9 +3982,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
op.open_flag |= O_NONBLOCK;
}
- ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- if (ret < 0)
- goto err;
+ if (!fixed) {
+ ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
+ if (ret < 0)
+ goto err;
+ }
file = do_filp_open(req->open.dfd, req->open.filename, &op);
if (IS_ERR(file)) {
@@ -3890,7 +3995,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
* marginal gain for something that is now known to be a slower
* path. So just put it, and we'll get a new one when we retry.
*/
- put_unused_fd(ret);
+ if (!fixed)
+ put_unused_fd(ret);
ret = PTR_ERR(file);
/* only retry if RESOLVE_CACHED wasn't already set by application */
@@ -3903,7 +4009,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
- fd_install(ret, file);
+
+ if (!fixed)
+ fd_install(ret, file);
+ else
+ ret = io_install_fixed_file(req, file, issue_flags,
+ req->open.file_slot - 1);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -3924,7 +4035,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+ if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -3995,7 +4107,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags)
+ if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -4082,7 +4194,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4128,7 +4240,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off)
+ if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4163,7 +4275,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->addr)
+ if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4201,7 +4313,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -4237,7 +4349,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
+ sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -4298,7 +4410,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ sqe->splice_fd_in))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -4732,6 +4845,15 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+
+ accept->file_slot = READ_ONCE(sqe->file_index);
+ if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
+ (accept->flags & SOCK_CLOEXEC)))
+ return -EINVAL;
+ if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
+ accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
return 0;
}
@@ -4740,20 +4862,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- int ret;
+ bool fixed = !!accept->file_slot;
+ struct file *file;
+ int ret, fd;
if (req->file->f_flags & O_NONBLOCK)
req->flags |= REQ_F_NOWAIT;
- ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- accept->addr_len, accept->flags,
- accept->nofile);
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret < 0) {
+ if (!fixed) {
+ fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
+ file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+ accept->flags);
+ if (IS_ERR(file)) {
+ if (!fixed)
+ put_unused_fd(fd);
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
+ } else if (!fixed) {
+ fd_install(fd, file);
+ ret = fd;
+ } else {
+ ret = io_install_fixed_file(req, file, issue_flags,
+ accept->file_slot - 1);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
@@ -4773,7 +4910,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -4886,6 +5024,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
{
struct io_ring_ctx *ctx = req->ctx;
+ /* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
WRITE_ONCE(poll->canceled, true);
@@ -4964,7 +5103,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static void io_poll_task_func(struct io_kiocb *req)
+static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
@@ -4988,7 +5127,7 @@ static void io_poll_task_func(struct io_kiocb *req)
if (done) {
nxt = io_put_req_find_next(req);
if (nxt)
- io_req_task_submit(nxt);
+ io_req_task_submit(nxt, locked);
}
}
}
@@ -5055,8 +5194,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
if (unlikely(pt->nr_entries)) {
struct io_poll_iocb *poll_one = poll;
+ /* double add on the same waitqueue head, ignore */
+ if (poll_one->head == head)
+ return;
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
+ if ((*poll_ptr)->head == head)
+ return;
pt->error = -EINVAL;
return;
}
@@ -5066,9 +5210,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (!(poll_one->events & EPOLLONESHOT))
poll_one->events |= EPOLLONESHOT;
- /* double add on the same waitqueue head, ignore */
- if (poll_one->head == head)
- return;
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
@@ -5098,7 +5239,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
-static void io_async_task_func(struct io_kiocb *req)
+static void io_async_task_func(struct io_kiocb *req, bool *locked)
{
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
@@ -5115,7 +5256,7 @@ static void io_async_task_func(struct io_kiocb *req)
spin_unlock(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
- io_req_task_submit(req);
+ io_req_task_submit(req, locked);
else
io_req_complete_failed(req, -ECANCELED);
}
@@ -5233,17 +5374,14 @@ static int io_arm_poll_handler(struct io_kiocb *req)
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
- io_req_refcount(req);
+ io_req_set_refcount(req);
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
- if (ret || ipt.error) {
- spin_unlock(&ctx->completion_lock);
- if (ret)
- return IO_APOLL_READY;
- return IO_APOLL_ABORTED;
- }
spin_unlock(&ctx->completion_lock);
+ if (ret || ipt.error)
+ return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
+
trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
mask, apoll->poll.events);
return IO_APOLL_OK;
@@ -5369,7 +5507,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -5424,7 +5562,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
- io_req_refcount(req);
+ io_req_set_refcount(req);
poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -5517,18 +5655,10 @@ err:
return 0;
}
-static void io_req_task_timeout(struct io_kiocb *req)
+static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
req_set_fail(req);
- io_put_req(req);
+ io_req_complete_post(req, -ETIME, 0);
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
@@ -5574,6 +5704,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
}
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+ __must_hold(&ctx->completion_lock)
__must_hold(&ctx->timeout_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
@@ -5587,6 +5718,47 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
return 0;
}
+static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
+{
+ switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
+ case IORING_TIMEOUT_BOOTTIME:
+ return CLOCK_BOOTTIME;
+ case IORING_TIMEOUT_REALTIME:
+ return CLOCK_REALTIME;
+ default:
+ /* can't happen, vetted at prep time */
+ WARN_ON_ONCE(1);
+ fallthrough;
+ case 0:
+ return CLOCK_MONOTONIC;
+ }
+}
+
+static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
+ __must_hold(&ctx->timeout_lock)
+{
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ bool found = false;
+
+ list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
+ found = user_data == req->user_data;
+ if (found)
+ break;
+ }
+ if (!found)
+ return -ENOENT;
+
+ io = req->async_data;
+ if (hrtimer_try_to_cancel(&io->timer) == -1)
+ return -EALREADY;
+ hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
+ io->timer.function = io_link_timeout_fn;
+ hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
+ return 0;
+}
+
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
@@ -5600,7 +5772,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
req->timeout.off = 0; /* noseq */
data = req->async_data;
list_add_tail(&req->timeout.list, &ctx->timeout_list);
- hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
return 0;
@@ -5615,13 +5787,18 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len)
+ if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
+ tr->ltimeout = false;
tr->addr = READ_ONCE(sqe->addr);
tr->flags = READ_ONCE(sqe->timeout_flags);
- if (tr->flags & IORING_TIMEOUT_UPDATE) {
- if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
+ if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
+ return -EINVAL;
+ if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
+ tr->ltimeout = true;
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
return -EFAULT;
@@ -5648,22 +5825,26 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- spin_lock_irq(&ctx->timeout_lock);
- if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
+ if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
+ spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, tr->addr);
- else
- ret = io_timeout_update(ctx, tr->addr, &tr->ts,
- io_translate_timeout_mode(tr->flags));
- spin_unlock_irq(&ctx->timeout_lock);
+ spin_unlock_irq(&ctx->timeout_lock);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
+
+ spin_lock_irq(&ctx->timeout_lock);
+ if (tr->ltimeout)
+ ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ else
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ spin_unlock_irq(&ctx->timeout_lock);
+ }
- spin_lock(&ctx->completion_lock);
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
if (ret < 0)
req_set_fail(req);
- io_put_req(req);
+ io_req_complete_post(req, ret, 0);
return 0;
}
@@ -5676,14 +5857,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
+ sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~IORING_TIMEOUT_ABS)
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ return -EINVAL;
+ /* more than one clock specified is invalid, obviously */
+ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL;
+ INIT_LIST_HEAD(&req->timeout.list);
req->timeout.off = off;
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
@@ -5693,14 +5879,24 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data = req->async_data;
data->req = req;
+ data->flags = flags;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
- if (is_timeout_link)
- io_req_track_inflight(req);
+ hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
+
+ if (is_timeout_link) {
+ struct io_submit_link *link = &req->ctx->submit_state.link;
+
+ if (!link->head)
+ return -EINVAL;
+ if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
+ return -EINVAL;
+ req->timeout.head = link->last;
+ link->last->flags |= REQ_F_ARM_LTIMEOUT;
+ }
return 0;
}
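
The prep path above now accepts IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME alongside the default monotonic clock, with exactly one clock bit allowed per SQE. A minimal userspace sketch of arming such a timeout follows, assuming the raw SQE layout from <linux/io_uring.h> and a ring that is set up elsewhere; it is illustrative, not the kernel's own code.

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <string.h>

/* Hypothetical helper: fill a timeout SQE that fires on CLOCK_BOOTTIME. */
static void prep_boottime_timeout(struct io_uring_sqe *sqe,
                                  const struct __kernel_timespec *ts)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_TIMEOUT;
        sqe->addr = (unsigned long)ts;          /* timespec read at prep time */
        sqe->len = 1;                           /* prep insists on len == 1 */
        sqe->timeout_flags = IORING_TIMEOUT_BOOTTIME;   /* one clock bit only */
}
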
@@ -5793,32 +5989,27 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
return ret;
}
-static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
- struct io_kiocb *req, __u64 sqe_addr,
- int success_ret)
+static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
{
+ struct io_ring_ctx *ctx = req->ctx;
int ret;
+ WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
+
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- spin_lock(&ctx->completion_lock);
if (ret != -ENOENT)
- goto done;
+ return ret;
+
+ spin_lock(&ctx->completion_lock);
spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, sqe_addr);
spin_unlock_irq(&ctx->timeout_lock);
if (ret != -ENOENT)
- goto done;
+ goto out;
ret = io_poll_cancel(ctx, sqe_addr, false);
-done:
- if (!ret)
- ret = success_ret;
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
+out:
spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-
- if (ret < 0)
- req_set_fail(req);
+ return ret;
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -5828,7 +6019,8 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -5842,20 +6034,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
struct io_tctx_node *node;
int ret;
- /* tasks should wait for their io-wq threads, so safe w/o sync */
- ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- spin_lock(&ctx->completion_lock);
- if (ret != -ENOENT)
- goto done;
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, sqe_addr);
- spin_unlock_irq(&ctx->timeout_lock);
+ ret = io_try_cancel_userdata(req, sqe_addr);
if (ret != -ENOENT)
goto done;
- ret = io_poll_cancel(ctx, sqe_addr, false);
- if (ret != -ENOENT)
- goto done;
- spin_unlock(&ctx->completion_lock);
/* slow path, try all io-wq's */
io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
@@ -5868,17 +6049,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
break;
}
io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
-
- spin_lock(&ctx->completion_lock);
done:
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-
if (ret < 0)
req_set_fail(req);
- io_put_req(req);
+ io_req_complete_post(req, ret, 0);
return 0;
}
@@ -5887,7 +6061,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags)
+ if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -6093,7 +6267,7 @@ fail:
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
return true;
}
@@ -6316,14 +6490,17 @@ static void io_wq_submit_work(struct io_wq_work *work)
struct io_kiocb *timeout;
int ret = 0;
- io_req_refcount(req);
- /* will be dropped by ->io_free_work() after returning to io-wq */
- req_ref_get(req);
+ /* one will be dropped by ->io_free_work() after returning to io-wq */
+ if (!(req->flags & REQ_F_REFCOUNT))
+ __io_req_set_refcount(req, 2);
+ else
+ req_ref_get(req);
timeout = io_prep_linked_timeout(req);
if (timeout)
io_queue_linked_timeout(timeout);
+ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -6413,15 +6590,15 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx,
return io_file_get_normal(ctx, req, fd);
}
-static void io_req_task_link_timeout(struct io_kiocb *req)
+static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
struct io_kiocb *prev = req->timeout.prev;
- struct io_ring_ctx *ctx = req->ctx;
+ int ret;
if (prev) {
- io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
+ ret = io_try_cancel_userdata(req, prev->user_data);
+ io_req_complete_post(req, ret ?: -ETIME, 0);
io_put_req(prev);
- io_put_req(req);
} else {
io_req_complete_post(req, -ETIME, 0);
}
@@ -6448,6 +6625,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (!req_ref_inc_not_zero(prev))
prev = NULL;
}
+ list_del(&req->timeout.list);
req->timeout.prev = prev;
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
@@ -6471,6 +6649,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
+ list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
}
spin_unlock_irq(&ctx->timeout_lock);
/* drop submission reference */
@@ -6480,7 +6659,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
int ret;
issue_sqe:
@@ -6498,24 +6677,34 @@ issue_sqe:
state->compl_reqs[state->compl_nr++] = req;
if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
io_submit_flush_completions(ctx);
+ return;
}
+
+ linked_timeout = io_prep_linked_timeout(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ linked_timeout = io_prep_linked_timeout(req);
+
switch (io_arm_poll_handler(req)) {
case IO_APOLL_READY:
+ if (linked_timeout)
+ io_unprep_linked_timeout(req);
goto issue_sqe;
case IO_APOLL_ABORTED:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
break;
}
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
} else {
io_req_complete_failed(req, ret);
}
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
}
static inline void io_queue_sqe(struct io_kiocb *req)
@@ -6524,15 +6713,17 @@ static inline void io_queue_sqe(struct io_kiocb *req)
if (unlikely(req->ctx->drain_active) && io_drain_req(req))
return;
- if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
__io_queue_sqe(req);
+ } else if (req->flags & REQ_F_FAIL) {
+ io_req_complete_failed(req, req->result);
} else {
int ret = io_req_prep_async(req);
if (unlikely(ret))
io_req_complete_failed(req, ret);
else
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
}
}
@@ -6634,20 +6825,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
fail_req:
+ /* fail even hard links since we don't submit */
if (link->head) {
- /* fail even hard links since we don't submit */
- req_set_fail(link->head);
- io_req_complete_failed(link->head, -ECANCELED);
- link->head = NULL;
+ /*
+ * A link request counts as failed or cancelled once REQ_F_FAIL is
+ * set, but the head is an exception: it may carry REQ_F_FAIL only
+ * because some other request in the chain failed. Leverage
+ * req->result to tell whether the head failed on its own or on
+ * behalf of another request, so the correct ret code is reported.
+ * Initialise result here to avoid affecting the normal path.
+ */
+ if (!(link->head->flags & REQ_F_FAIL))
+ req_fail_link_node(link->head, -ECANCELED);
+ } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
+ /*
+ * The current request is a normal one: return the error and
+ * break out of the submission loop.
+ */
+ io_req_complete_failed(req, ret);
+ return ret;
}
- io_req_complete_failed(req, ret);
- return ret;
+ req_fail_link_node(req, ret);
+ } else {
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
}
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
-
/* don't need @sqe from now on */
trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
req->flags, true,
@@ -6663,9 +6868,14 @@ fail_req:
if (link->head) {
struct io_kiocb *head = link->head;
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- goto fail_req;
+ if (!(req->flags & REQ_F_FAIL)) {
+ ret = io_req_prep_async(req);
+ if (unlikely(ret)) {
+ req_fail_link_node(req, ret);
+ if (!(head->flags & REQ_F_FAIL))
+ req_fail_link_node(head, -ECANCELED);
+ }
+ }
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;
@@ -6760,25 +6970,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
- struct io_uring_task *tctx;
int submitted = 0;
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
+ io_get_task_refs(nr);
- tctx = current->io_uring;
- tctx->cached_refs -= nr;
- if (unlikely(tctx->cached_refs < 0)) {
- unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
-
- percpu_counter_add(&tctx->inflight, refill);
- refcount_add(refill, &current->usage);
- tctx->cached_refs += refill;
- }
io_submit_state_start(&ctx->submit_state, nr);
-
while (submitted < nr) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
@@ -6791,7 +6991,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- kmem_cache_free(req_cachep, req);
+ list_add(&req->inflight_entry, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
@@ -6856,7 +7056,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0, true);
+ io_do_iopoll(ctx, &nr_events, 0);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7136,14 +7336,14 @@ static void **io_alloc_page_table(size_t size)
size_t init_size = size;
void **table;
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
if (!table)
return NULL;
for (i = 0; i < nr_tables; i++) {
unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
- table[i] = kzalloc(this_size, GFP_KERNEL);
+ table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
@@ -7334,7 +7534,8 @@ fail:
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
+ table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ GFP_KERNEL_ACCOUNT);
return !!table->files;
}
@@ -7731,6 +7932,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ if (nr_args > rlimit(RLIMIT_NOFILE))
+ return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
@@ -7840,6 +8043,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags, u32 slot_index)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct io_fixed_file *file_slot;
+ int ret = -EBADF;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+ if (file->f_op == &io_uring_fops)
+ goto err;
+ ret = -ENXIO;
+ if (!ctx->file_data)
+ goto err;
+ ret = -EINVAL;
+ if (slot_index >= ctx->nr_user_files)
+ goto err;
+
+ slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
+ ret = -EBADF;
+ if (file_slot->file_ptr)
+ goto err;
+
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ ret = io_sqe_file_register(ctx, file, slot_index);
+ if (ret) {
+ file_slot->file_ptr = 0;
+ goto err;
+ }
+
+ ret = 0;
+err:
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ if (ret)
+ fput(file);
+ return ret;
+}
+
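
io_install_fixed_file() above lets a request drop its result straight into the fixed-file table instead of allocating a normal descriptor. A hedged userspace sketch of requesting that behaviour, assuming the sqe->file_index field from <linux/io_uring.h> and the "slot + 1, 0 means regular fd" convention; treat the accept example as illustrative rather than authoritative.

#include <linux/io_uring.h>
#include <string.h>

/* Hypothetical helper: accept into fixed-file slot 'slot' rather than
 * returning an ordinary file descriptor. */
static void prep_accept_direct(struct io_uring_sqe *sqe, int listen_fd,
                               unsigned int slot)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_ACCEPT;
        sqe->fd = listen_fd;
        sqe->file_index = slot + 1;     /* 0 keeps the old "return an fd" path */
}
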
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
@@ -8699,6 +8942,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
sock_release(ctx->ring_sock);
}
#endif
+ WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -9126,8 +9370,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
* Must be after io_uring_del_task_file() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
- tctx->io_wq = NULL;
io_wq_put_and_exit(wq);
+ tctx->io_wq = NULL;
}
}
@@ -9213,9 +9457,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
}
}
-void __io_uring_cancel(struct files_struct *files)
+void __io_uring_cancel(bool cancel_all)
{
- io_uring_cancel_generic(!files, NULL);
+ io_uring_cancel_generic(cancel_all, NULL);
}
static void *io_uring_validate_mmap_request(struct file *file,
@@ -10053,6 +10297,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
+static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ __u32 new_count[2];
+ int i, ret;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+ if (copy_from_user(new_count, arg, sizeof(new_count)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i] > INT_MAX)
+ return -EINVAL;
+
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, new_count, sizeof(new_count)))
+ return -EFAULT;
+
+ return 0;
+}
+
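
The register opcode added above caps the number of bounded and unbounded io-wq workers and hands back the previous limits. A hedged sketch of driving it through the raw syscall, assuming __NR_io_uring_register and IORING_REGISTER_IOWQ_MAX_WORKERS from the uapi headers:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Cap bounded ([0]) and unbounded ([1]) io-wq workers to four each;
 * on success the previous limits are written back into counts[]. */
static int cap_iowq_workers(int ring_fd)
{
        __u32 counts[2] = { 4, 4 };

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
}
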
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
@@ -10070,6 +10339,7 @@ static bool io_register_op_must_quiesce(int op)
case IORING_REGISTER_BUFFERS_UPDATE:
case IORING_REGISTER_IOWQ_AFF:
case IORING_UNREGISTER_IOWQ_AFF:
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
return false;
default:
return true;
@@ -10226,6 +10496,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_unregister_iowq_aff(ctx);
break;
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 2)
+ break;
+ ret = io_register_iowq_max_workers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -10307,11 +10583,16 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
+ BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
sizeof(struct io_uring_rsrc_update));
BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
sizeof(struct io_uring_rsrc_update2));
+
+ /* ->buf_index is u16 */
+ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1e2204fa9963..eea8267ae1f2 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -263,209 +263,6 @@ static long ioctl_file_clone_range(struct file *file,
args.src_length, args.dest_offset);
}
-#ifdef CONFIG_BLOCK
-
-static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
-{
- return (offset >> inode->i_blkbits);
-}
-
-static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
-{
- return (blk << inode->i_blkbits);
-}
-
-/**
- * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
- * @inode: the inode to map
- * @fieinfo: the fiemap info struct that will be passed back to userspace
- * @start: where to start mapping in the inode
- * @len: how much space to map
- * @get_block: the fs's get_block function
- *
- * This does FIEMAP for block based inodes. Basically it will just loop
- * through get_block until we hit the number of extents we want to map, or we
- * go past the end of the file and hit a hole.
- *
- * If it is possible to have data blocks beyond a hole past @inode->i_size, then
- * please do not use this function, it will stop at the first unmapped block
- * beyond i_size.
- *
- * If you use this function directly, you need to do your own locking. Use
- * generic_block_fiemap if you want the locking done for you.
- */
-static int __generic_block_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo, loff_t start,
- loff_t len, get_block_t *get_block)
-{
- struct buffer_head map_bh;
- sector_t start_blk, last_blk;
- loff_t isize = i_size_read(inode);
- u64 logical = 0, phys = 0, size = 0;
- u32 flags = FIEMAP_EXTENT_MERGED;
- bool past_eof = false, whole_file = false;
- int ret = 0;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- /*
- * Either the i_mutex or other appropriate locking needs to be held
- * since we expect isize to not change at all through the duration of
- * this call.
- */
- if (len >= isize) {
- whole_file = true;
- len = isize;
- }
-
- /*
- * Some filesystems can't deal with being asked to map less than
- * blocksize, so make sure our len is at least block length.
- */
- if (logical_to_blk(inode, len) == 0)
- len = blk_to_logical(inode, 1);
-
- start_blk = logical_to_blk(inode, start);
- last_blk = logical_to_blk(inode, start + len - 1);
-
- do {
- /*
- * we set b_size to the total size we want so it will map as
- * many contiguous blocks as possible at once
- */
- memset(&map_bh, 0, sizeof(struct buffer_head));
- map_bh.b_size = len;
-
- ret = get_block(inode, start_blk, &map_bh, 0);
- if (ret)
- break;
-
- /* HOLE */
- if (!buffer_mapped(&map_bh)) {
- start_blk++;
-
- /*
- * We want to handle the case where there is an
- * allocated block at the front of the file, and then
- * nothing but holes up to the end of the file properly,
- * to make sure that extent at the front gets properly
- * marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof &&
- blk_to_logical(inode, start_blk) >= isize)
- past_eof = 1;
-
- /*
- * First hole after going past the EOF, this is our
- * last extent
- */
- if (past_eof && size) {
- flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- } else if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- size = 0;
- }
-
- /* if we have holes up to/past EOF then we're done */
- if (start_blk > last_blk || past_eof || ret)
- break;
- } else {
- /*
- * We have gone over the length of what we wanted to
- * map, and it wasn't the entire file, so add the extent
- * we got last time and exit.
- *
- * This is for the case where say we want to map all the
- * way up to the second to the last block in a file, but
- * the last block is a hole, making the second to last
- * block FIEMAP_EXTENT_LAST. In this case we want to
- * see if there is a hole after the second to last block
- * so we can mark it properly. If we found data after
- * we exceeded the length we were requesting, then we
- * are good to go, just add the extent to the fieinfo
- * and break
- */
- if (start_blk > last_blk && !whole_file) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- break;
- }
-
- /*
- * if size != 0 then we know we already have an extent
- * to add, so add it.
- */
- if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- if (ret)
- break;
- }
-
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
- flags = FIEMAP_EXTENT_MERGED;
-
- start_blk += logical_to_blk(inode, size);
-
- /*
- * If we are past the EOF, then we need to make sure as
- * soon as we find a hole that the last extent we found
- * is marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof && logical + size >= isize)
- past_eof = true;
- }
- cond_resched();
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- } while (1);
-
- /* If ret is 1 then we just hit the end of the extent array */
- if (ret == 1)
- ret = 0;
-
- return ret;
-}
-
-/**
- * generic_block_fiemap - FIEMAP for block based inodes
- * @inode: The inode to map
- * @fieinfo: The mapping information
- * @start: The initial block to map
- * @len: The length of the extect to attempt to map
- * @get_block: The block mapping function for the fs
- *
- * Calls __generic_block_fiemap to map the inode, after taking
- * the inode's mutex lock.
- */
-
-int generic_block_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo, u64 start,
- u64 len, get_block_t *get_block)
-{
- int ret;
- inode_lock(inode);
- ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
- inode_unlock(inode);
- return ret;
-}
-EXPORT_SYMBOL(generic_block_fiemap);
-
-#endif /* CONFIG_BLOCK */
-
/*
* This provides compatibility with legacy XFS pre-allocation ioctls
* which predate the fallocate syscall.
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 21edc423b79f..678e2c51b855 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -155,7 +155,6 @@ struct iso9660_options{
unsigned int overriderockperm:1;
unsigned int uid_set:1;
unsigned int gid_set:1;
- unsigned int utf8:1;
unsigned char map;
unsigned char check;
unsigned int blocksize;
@@ -356,7 +355,6 @@ static int parse_options(char *options, struct iso9660_options *popt)
popt->gid = GLOBAL_ROOT_GID;
popt->uid = GLOBAL_ROOT_UID;
popt->iocharset = NULL;
- popt->utf8 = 0;
popt->overriderockperm = 0;
popt->session=-1;
popt->sbsector=-1;
@@ -389,10 +387,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
case Opt_cruft:
popt->cruft = 1;
break;
+#ifdef CONFIG_JOLIET
case Opt_utf8:
- popt->utf8 = 1;
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup("utf8", GFP_KERNEL);
+ if (!popt->iocharset)
+ return 0;
break;
-#ifdef CONFIG_JOLIET
case Opt_iocharset:
kfree(popt->iocharset);
popt->iocharset = match_strdup(&args[0]);
@@ -495,7 +496,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
if (sbi->s_nocompress) seq_puts(m, ",nocompress");
if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm");
if (sbi->s_showassoc) seq_puts(m, ",showassoc");
- if (sbi->s_utf8) seq_puts(m, ",utf8");
if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check);
if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping);
@@ -518,9 +518,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",fmode=%o", sbi->s_fmode);
#ifdef CONFIG_JOLIET
- if (sbi->s_nls_iocharset &&
- strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0)
+ if (sbi->s_nls_iocharset)
seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset);
+ else
+ seq_puts(m, ",iocharset=utf8");
#endif
return 0;
}
@@ -863,14 +864,13 @@ root_found:
sbi->s_nls_iocharset = NULL;
#ifdef CONFIG_JOLIET
- if (joliet_level && opt.utf8 == 0) {
+ if (joliet_level) {
char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
- sbi->s_nls_iocharset = load_nls(p);
- if (! sbi->s_nls_iocharset) {
- /* Fail only if explicit charset specified */
- if (opt.iocharset)
+ if (strcmp(p, "utf8") != 0) {
+ sbi->s_nls_iocharset = opt.iocharset ?
+ load_nls(opt.iocharset) : load_nls_default();
+ if (!sbi->s_nls_iocharset)
goto out_freesbi;
- sbi->s_nls_iocharset = load_nls_default();
}
}
#endif
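
With the change above, the isofs utf8 option is folded into iocharset handling: utf8 now simply selects iocharset=utf8, and an NLS table is only loaded for other charsets. A hedged sketch of the equivalent mount(2) call, with the device and mount point as placeholder paths:

#include <sys/mount.h>

/* "utf8" and "iocharset=utf8" are now interchangeable for iso9660. */
static int mount_iso_utf8(void)
{
        return mount("/dev/sr0", "/mnt/cdrom", "iso9660", MS_RDONLY,
                     "iocharset=utf8");
}
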
@@ -886,7 +886,6 @@ root_found:
sbi->s_gid = opt.gid;
sbi->s_uid_set = opt.uid_set;
sbi->s_gid_set = opt.gid_set;
- sbi->s_utf8 = opt.utf8;
sbi->s_nocompress = opt.nocompress;
sbi->s_overriderockperm = opt.overriderockperm;
/*
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 055ec6c586f7..dcdc191ed183 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -44,7 +44,6 @@ struct isofs_sb_info {
unsigned char s_session;
unsigned int s_high_sierra:1;
unsigned int s_rock:2;
- unsigned int s_utf8:1;
unsigned int s_cruft:1; /* Broken disks with high byte of length
* containing junk */
unsigned int s_nocompress:1;
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index be8b6a9d0b92..c0f04a1e7f69 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
int
get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
{
- unsigned char utf8;
struct nls_table *nls;
unsigned char len = 0;
- utf8 = ISOFS_SB(inode->i_sb)->s_utf8;
nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
- if (utf8) {
+ if (!nls) {
len = utf16s_to_utf8s((const wchar_t *) de->name,
de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
outname, PAGE_SIZE);
diff --git a/fs/locks.c b/fs/locks.c
index 74b2a1dfe8d8..3d6fb4ae847b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1397,103 +1397,6 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
return error;
}
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-/**
- * locks_mandatory_locked - Check for an active lock
- * @file: the file to check
- *
- * Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from locks_verify_locked() only.
- */
-int locks_mandatory_locked(struct file *file)
-{
- int ret;
- struct inode *inode = locks_inode(file);
- struct file_lock_context *ctx;
- struct file_lock *fl;
-
- ctx = smp_load_acquire(&inode->i_flctx);
- if (!ctx || list_empty_careful(&ctx->flc_posix))
- return 0;
-
- /*
- * Search the lock list for this inode for any POSIX locks.
- */
- spin_lock(&ctx->flc_lock);
- ret = 0;
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (fl->fl_owner != current->files &&
- fl->fl_owner != file) {
- ret = -EAGAIN;
- break;
- }
- }
- spin_unlock(&ctx->flc_lock);
- return ret;
-}
-
-/**
- * locks_mandatory_area - Check for a conflicting lock
- * @inode: the file to check
- * @filp: how the file was opened (if it was)
- * @start: first byte in the file to check
- * @end: lastbyte in the file to check
- * @type: %F_WRLCK for a write lock, else %F_RDLCK
- *
- * Searches the inode's list of locks to find any POSIX locks which conflict.
- */
-int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
- loff_t end, unsigned char type)
-{
- struct file_lock fl;
- int error;
- bool sleep = false;
-
- locks_init_lock(&fl);
- fl.fl_pid = current->tgid;
- fl.fl_file = filp;
- fl.fl_flags = FL_POSIX | FL_ACCESS;
- if (filp && !(filp->f_flags & O_NONBLOCK))
- sleep = true;
- fl.fl_type = type;
- fl.fl_start = start;
- fl.fl_end = end;
-
- for (;;) {
- if (filp) {
- fl.fl_owner = filp;
- fl.fl_flags &= ~FL_SLEEP;
- error = posix_lock_inode(inode, &fl, NULL);
- if (!error)
- break;
- }
-
- if (sleep)
- fl.fl_flags |= FL_SLEEP;
- fl.fl_owner = current->files;
- error = posix_lock_inode(inode, &fl, NULL);
- if (error != FILE_LOCK_DEFERRED)
- break;
- error = wait_event_interruptible(fl.fl_wait,
- list_empty(&fl.fl_blocked_member));
- if (!error) {
- /*
- * If we've been sleeping someone might have
- * changed the permissions behind our back.
- */
- if (__mandatory_lock(inode))
- continue;
- }
-
- break;
- }
- locks_delete_block(&fl);
-
- return error;
-}
-EXPORT_SYMBOL(locks_mandatory_area);
-#endif /* CONFIG_MANDATORY_FILE_LOCKING */
-
static void lease_clear_pending(struct file_lock *fl, int arg)
{
switch (arg) {
@@ -2486,14 +2389,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /* Don't allow mandatory locks on files that may be memory mapped
- * and shared.
- */
- if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
- error = -EAGAIN;
- goto out;
- }
-
error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2611,21 +2506,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct inode *inode = locks_inode(filp);
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
- /* Don't allow mandatory locks on files that may be memory mapped
- * and shared.
- */
- if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
- error = -EAGAIN;
- goto out;
- }
-
error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2857,8 +2743,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_puts(f, "POSIX ");
seq_printf(f, " %s ",
- (inode == NULL) ? "*NOINODE*" :
- mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
+ (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
if (fl->fl_type & LOCK_MAND) {
seq_puts(f, "FLOCK MSNFS ");
diff --git a/fs/namei.c b/fs/namei.c
index bf6d8a738c59..471eb9fead6e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3023,9 +3023,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
/*
* Refuse to truncate files with mandatory locks held on them.
*/
- error = locks_verify_locked(filp);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
diff --git a/fs/namespace.c b/fs/namespace.c
index 97adcb5ab5d5..20caa4b4c539 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1715,22 +1715,14 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-static bool may_mandlock(void)
+static void warn_mandlock(void)
{
- pr_warn_once("======================================================\n"
- "WARNING: the mand mount option is being deprecated and\n"
- " will be removed in v5.15!\n"
- "======================================================\n");
- return capable(CAP_SYS_ADMIN);
+ pr_warn_once("=======================================================\n"
+ "WARNING: The mand mount option has been deprecated and\n"
+ " and is ignored by this kernel. Remove the mand\n"
+ " option from the mount to silence this warning.\n"
+ "=======================================================\n");
}
-#else
-static inline bool may_mandlock(void)
-{
- pr_warn("VFS: \"mand\" mount option not supported");
- return false;
-}
-#endif
static int can_umount(const struct path *path, int flags)
{
@@ -3197,8 +3189,8 @@ int path_mount(const char *dev_name, struct path *path,
return ret;
if (!may_mount())
return -EPERM;
- if ((flags & SB_MANDLOCK) && !may_mandlock())
- return -EPERM;
+ if (flags & SB_MANDLOCK)
+ warn_mandlock();
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
@@ -3581,9 +3573,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
goto err_unlock;
- ret = -EPERM;
- if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
- goto err_unlock;
+ if (fc->sb_flags & SB_MANDLOCK)
+ warn_mandlock();
newmount.mnt = vfs_create_mount(fc);
if (IS_ERR(newmount.mnt)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1fef107961bc..514be5d28d70 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -806,10 +806,6 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
- /* No mandatory locks over NFS */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
is_local = 1;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fa67ecd5fe63..8313e1dbb5dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5735,16 +5735,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
NFS4_SHARE_DENY_READ);
}
-/*
- * Allow READ/WRITE during grace period on recovered state only for files
- * that are not able to provide mandatory locking.
- */
-static inline int
-grace_disallows_io(struct net *net, struct inode *inode)
-{
- return opens_in_grace(net) && mandatory_lock(inode);
-}
-
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
@@ -6026,7 +6016,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
stateid_t *stateid, int flags, struct nfsd_file **nfp,
struct nfs4_stid **cstid)
{
- struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct nfs4_stid *s = NULL;
@@ -6035,9 +6024,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
if (nfp)
*nfp = NULL;
- if (grace_disallows_io(net, ino))
- return nfserr_grace;
-
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
status = check_special_stateids(net, fhp, stateid, flags);
goto done;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a224a5e23cc1..92e77f92268a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -333,7 +333,6 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct iattr *iap)
{
struct inode *inode = d_inode(fhp->fh_dentry);
- int host_err;
if (iap->ia_size < inode->i_size) {
__be32 err;
@@ -343,20 +342,7 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
return err;
}
-
- host_err = get_write_access(inode);
- if (host_err)
- goto out_nfserrno;
-
- host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
- if (host_err)
- goto out_put_write_access;
- return 0;
-
-out_put_write_access:
- put_write_access(inode);
-out_nfserrno:
- return nfserrno(host_err);
+ return nfserrno(get_write_access(inode));
}
/*
@@ -750,13 +736,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
- /*
- * We must ignore files (but only files) which might have mandatory
- * locks on them because there is no way to know if the accesser has
- * the lock.
- */
- if (S_ISREG((inode)->i_mode) && mandatory_lock(inode))
- goto out;
if (!inode->i_fop)
goto out;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 4abd928b0bc8..f6b2d280aab5 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 28b67cb9458d..6facdf476255 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
@@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
#define FANOTIFY_EVENT_ALIGN 4
-#define FANOTIFY_INFO_HDR_LEN \
+#define FANOTIFY_FID_INFO_HDR_LEN \
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+ sizeof(struct fanotify_event_info_pidfd)
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
if (name_len)
info_len += name_len + 1;
- return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
+ return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
+ FANOTIFY_EVENT_ALIGN);
}
-static int fanotify_event_info_len(unsigned int fid_mode,
+static int fanotify_event_info_len(unsigned int info_mode,
struct fanotify_event *event)
{
struct fanotify_info *info = fanotify_event_info(event);
@@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode,
if (dir_fh_len) {
info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
- } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
+ } else if ((info_mode & FAN_REPORT_NAME) &&
+ (event->mask & FAN_ONDIR)) {
/*
* With group flag FAN_REPORT_NAME, if name was not recorded in
* event on a directory, we will report the name ".".
@@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode,
dot_len = 1;
}
+ if (info_mode & FAN_REPORT_PIDFD)
+ info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
if (fh_len)
info_len += fanotify_fid_info_len(fh_len, dot_len);
@@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t event_size = FAN_EVENT_METADATA_LEN;
struct fanotify_event *event = NULL;
struct fsnotify_event *fsn_event;
- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
@@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
goto out;
event = FANOTIFY_E(fsn_event);
- if (fid_mode)
- event_size += fanotify_event_info_len(fid_mode, event);
+ if (info_mode)
+ event_size += fanotify_event_info_len(info_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
@@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
-static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
- int info_type, const char *name, size_t name_len,
- char __user *buf, size_t count)
+static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ int info_type, const char *name,
+ size_t name_len,
+ char __user *buf, size_t count)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
@@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
return info_len;
}
+static int copy_pidfd_info_to_user(int pidfd,
+ char __user *buf,
+ size_t count)
+{
+ struct fanotify_event_info_pidfd info = { };
+ size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+ if (WARN_ON_ONCE(info_len > count))
+ return -EFAULT;
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
+ info.hdr.len = info_len;
+ info.pidfd = pidfd;
+
+ if (copy_to_user(buf, &info, info_len))
+ return -EFAULT;
+
+ return info_len;
+}
+
+static int copy_info_records_to_user(struct fanotify_event *event,
+ struct fanotify_info *info,
+ unsigned int info_mode, int pidfd,
+ char __user *buf, size_t count)
+{
+ int ret, total_bytes = 0, info_type = 0;
+ unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
+ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
+
+ /*
+ * Event info records order is as follows: dir fid + name, child fid.
+ */
+ if (fanotify_event_dir_fh_len(event)) {
+ info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
+ FAN_EVENT_INFO_TYPE_DFID;
+ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+ fanotify_info_dir_fh(info),
+ info_type,
+ fanotify_info_name(info),
+ info->name_len, buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ if (fanotify_event_object_fh_len(event)) {
+ const char *dot = NULL;
+ int dot_len = 0;
+
+ if (fid_mode == FAN_REPORT_FID || info_type) {
+ /*
+ * With only group flag FAN_REPORT_FID only type FID is
+ * reported. Second info record type is always FID.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_FID;
+ } else if ((fid_mode & FAN_REPORT_NAME) &&
+ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_NAME, if name was not
+ * recorded in an event on a directory, report the name
+ * "." with info type DFID_NAME.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
+ dot = ".";
+ dot_len = 1;
+ } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
+ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_DIR_FID, a single info
+ * record has type DFID for directory entry modification
+ * event and for event on a directory.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_DFID;
+ } else {
+ /*
+ * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
+ * a single info record has type FID for event on a
+ * non-directory, when there is no directory to report.
+ * For example, on FAN_DELETE_SELF event.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_FID;
+ }
+
+ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+ fanotify_event_object_fh(event),
+ info_type, dot, dot_len,
+ buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ if (pidfd_mode) {
+ ret = copy_pidfd_info_to_user(pidfd, buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ return total_bytes;
+}
+
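
copy_info_records_to_user() above emits the fid records first and the new pidfd record last. A hedged userspace sketch of walking those records for one event, assuming the struct and info-type names from <linux/fanotify.h> and leaving out the surrounding read() loop:

#include <linux/fanotify.h>
#include <stdio.h>

/* Scan the info records trailing one event and print a pidfd record,
 * if one is present. */
static void show_pidfd_record(const struct fanotify_event_metadata *md)
{
        const char *p = (const char *)md + md->metadata_len;
        const char *end = (const char *)md + md->event_len;

        while (p < end) {
                const struct fanotify_event_info_header *hdr =
                        (const struct fanotify_event_info_header *)p;

                if (hdr->info_type == FAN_EVENT_INFO_TYPE_PIDFD)
                        printf("pidfd: %d\n",
                               ((const struct fanotify_event_info_pidfd *)hdr)->pidfd);
                p += hdr->len;
        }
}
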
static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fanotify_event *event,
char __user *buf, size_t count)
@@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fanotify_event_metadata metadata;
struct path *path = fanotify_event_path(event);
struct fanotify_info *info = fanotify_event_info(event);
- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
+ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
struct file *f = NULL;
- int ret, fd = FAN_NOFD;
- int info_type = 0;
+ int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
metadata.event_len = FAN_EVENT_METADATA_LEN +
- fanotify_event_info_len(fid_mode, event);
+ fanotify_event_info_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
@@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
}
metadata.fd = fd;
+ if (pidfd_mode) {
+ /*
+ * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
+ * exclusion is ever lifted. At the time of incorporating pidfd
+ * support within fanotify, the pidfd API only supported the
+ * creation of pidfds for thread-group leaders.
+ */
+ WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
+
+ /*
+ * The PIDTYPE_TGID check on event->pid is performed preemptively
+ * to catch cases where the event listener reads events after the
+ * event-generating process has
+ * already terminated. Report FAN_NOPIDFD to the event listener
+ * in those cases, with all other pidfd creation errors being
+ * reported as FAN_EPIDFD.
+ */
+ if (metadata.pid == 0 ||
+ !pid_has_task(event->pid, PIDTYPE_TGID)) {
+ pidfd = FAN_NOPIDFD;
+ } else {
+ pidfd = pidfd_create(event->pid, 0);
+ if (pidfd < 0)
+ pidfd = FAN_EPIDFD;
+ }
+ }
+
ret = -EFAULT;
/*
* Sanity check copy size in case get_one_event() and
@@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (f)
fd_install(fd, f);
- /* Event info records order is: dir fid + name, child fid */
- if (fanotify_event_dir_fh_len(event)) {
- info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
- FAN_EVENT_INFO_TYPE_DFID;
- ret = copy_info_to_user(fanotify_event_fsid(event),
- fanotify_info_dir_fh(info),
- info_type, fanotify_info_name(info),
- info->name_len, buf, count);
+ if (info_mode) {
+ ret = copy_info_records_to_user(event, info, info_mode, pidfd,
+ buf, count);
if (ret < 0)
goto out_close_fd;
-
- buf += ret;
- count -= ret;
- }
-
- if (fanotify_event_object_fh_len(event)) {
- const char *dot = NULL;
- int dot_len = 0;
-
- if (fid_mode == FAN_REPORT_FID || info_type) {
- /*
- * With only group flag FAN_REPORT_FID only type FID is
- * reported. Second info record type is always FID.
- */
- info_type = FAN_EVENT_INFO_TYPE_FID;
- } else if ((fid_mode & FAN_REPORT_NAME) &&
- (event->mask & FAN_ONDIR)) {
- /*
- * With group flag FAN_REPORT_NAME, if name was not
- * recorded in an event on a directory, report the
- * name "." with info type DFID_NAME.
- */
- info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
- dot = ".";
- dot_len = 1;
- } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
- (event->mask & FAN_ONDIR)) {
- /*
- * With group flag FAN_REPORT_DIR_FID, a single info
- * record has type DFID for directory entry modification
- * event and for event on a directory.
- */
- info_type = FAN_EVENT_INFO_TYPE_DFID;
- } else {
- /*
- * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
- * a single info record has type FID for event on a
- * non-directory, when there is no directory to report.
- * For example, on FAN_DELETE_SELF event.
- */
- info_type = FAN_EVENT_INFO_TYPE_FID;
- }
-
- ret = copy_info_to_user(fanotify_event_fsid(event),
- fanotify_event_object_fh(event),
- info_type, dot, dot_len, buf, count);
- if (ret < 0)
- goto out_close_fd;
-
- buf += ret;
- count -= ret;
}
return metadata.event_len;
@@ -537,6 +628,10 @@ out_close_fd:
put_unused_fd(fd);
fput(f);
}
+
+ if (pidfd >= 0)
+ close_fd(pidfd);
+
return ret;
}
@@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
#endif
return -EINVAL;
+ /*
+ * A pidfd can only be returned for a thread-group leader; thus
+ * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
+ * exclusive.
+ */
+ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
+ return -EINVAL;
+
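
A hedged sketch of requesting pidfd reporting at group creation, assuming fanotify_init() and FAN_REPORT_PIDFD are available from <sys/fanotify.h>; combining it with FAN_REPORT_TID fails with EINVAL, as enforced above.

#include <sys/fanotify.h>
#include <fcntl.h>

/* Create a notification group whose events carry a pidfd info record. */
static int init_pidfd_group(void)
{
        return fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_PIDFD,
                             O_RDONLY | O_CLOEXEC);
}
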
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
return -EINVAL;
@@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 30d422b8c0fc..963e6ce75b96 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
- /* Wait for outstanding inode references from connectors */
- wait_var_event(&sb->s_fsnotify_inode_refs,
- !atomic_long_read(&sb->s_fsnotify_inode_refs));
}
void fsnotify_sb_delete(struct super_block *sb)
{
fsnotify_unmount_inodes(sb);
fsnotify_clear_marks_by_sb(sb);
+ /* Wait for outstanding object references from connectors */
+ wait_var_event(&sb->s_fsnotify_connectors,
+ !atomic_long_read(&sb->s_fsnotify_connectors));
}
/*
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index ff2063ec6b0f..87d8a50ee803 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb(
return container_of(conn->obj, struct super_block, s_fsnotify_marks);
}
+static inline struct super_block *fsnotify_connector_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ switch (conn->type) {
+ case FSNOTIFY_OBJ_TYPE_INODE:
+ return fsnotify_conn_inode(conn)->i_sb;
+ case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+ return fsnotify_conn_mount(conn)->mnt.mnt_sb;
+ case FSNOTIFY_OBJ_TYPE_SB:
+ return fsnotify_conn_sb(conn);
+ default:
+ return NULL;
+ }
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d32ab349db74..95006d1d29ab 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+ ihold(inode);
+ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+ wake_up_var(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ if (sb)
+ atomic_long_inc(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+ wake_up_var(&sb->s_fsnotify_connectors);
+}
+
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
@@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
- atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
+ fsnotify_put_sb_connectors(conn);
rcu_assign_pointer(*(conn->obj), NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
@@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
/* Drop object reference originally held by a connector */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
- struct inode *inode;
- struct super_block *sb;
-
if (!objp)
return;
/* Currently only inode references are passed to be dropped */
if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
return;
- inode = objp;
- sb = inode->i_sb;
- iput(inode);
- if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
- wake_up_var(&sb->s_fsnotify_inode_refs);
+ fsnotify_put_inode_ref(objp);
}
void fsnotify_put_mark(struct fsnotify_mark *mark)
@@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
- inode = igrab(fsnotify_conn_inode(conn));
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = fsnotify_conn_inode(conn);
+ fsnotify_get_inode_ref(inode);
+ }
+ fsnotify_get_sb_connectors(conn);
+
/*
* cmpxchg() provides the barrier so that readers of *connp can see
* only initialized structure
@@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
if (inode)
- iput(inode);
+ fsnotify_put_inode_ref(inode);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index fab7c6a4a7d0..73a3854b2afb 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -101,8 +101,6 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (__mandatory_lock(inode))
- return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
@@ -121,8 +119,6 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
}
diff --git a/fs/open.c b/fs/open.c
index 94bef26ff1b6..daa324606a41 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -105,9 +105,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (error)
goto put_write_and_out;
- error = locks_verify_truncate(inode, NULL, length);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error)
error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
@@ -189,9 +187,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
- error = locks_verify_truncate(inode, f.file, length);
- if (!error)
- error = security_path_truncate(&f.file->f_path);
+ error = security_path_truncate(&f.file->f_path);
if (!error)
error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
diff --git a/fs/pipe.c b/fs/pipe.c
index 678dee2a8228..6d4342bad9f1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
- if (unlikely(was_full)) {
+ if (unlikely(was_full))
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
/*
* But because we didn't read anything, at this point we can
@@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
wake_next_reader = false;
__pipe_unlock(pipe);
- if (was_full) {
+ if (was_full)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
if (ret > 0)
file_accessed(filp);
return ret;
@@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* become empty while we dropped the lock.
*/
__pipe_unlock(pipe);
- if (was_empty) {
+ if (was_empty)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
@@ -591,10 +588,9 @@ out:
* Epoll nonsensically wants a wakeup whether the pipe
* was already empty or not.
*/
- if (was_empty || pipe->poll_usage) {
+ if (was_empty || pipe->poll_usage)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index 9db7adf160d2..af057c57bdc6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -365,12 +365,8 @@ out_putf:
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
- struct inode *inode;
- int retval = -EINVAL;
-
- inode = file_inode(file);
if (unlikely((ssize_t) count < 0))
- return retval;
+ return -EINVAL;
/*
* ranged mandatory locking does not apply to streams - it makes sense
@@ -381,19 +377,12 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely(pos < 0)) {
if (!unsigned_offsets(file))
- return retval;
+ return -EINVAL;
if (count >= -pos) /* both values are in 0..LLONG_MAX */
return -EOVERFLOW;
} else if (unlikely((loff_t) (pos + count) < 0)) {
if (!unsigned_offsets(file))
- return retval;
- }
-
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
- read_write == READ ? F_RDLCK : F_WRLCK);
- if (retval < 0)
- return retval;
+ return -EINVAL;
}
}
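
With the mandatory-locking hook gone, rw_verify_area() above reduces to pure offset and length validation. The following is a self-contained userspace sketch of the remaining checks, with unsigned_offsets() modeled as a plain boolean parameter and the NULL-ppos stream case omitted; it is a simplified model, not the kernel function.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <sys/types.h>

/* Model of the checks rw_verify_area() still performs. */
static int verify_area_model(off_t pos, size_t count, bool unsigned_offsets)
{
        if ((ssize_t)count < 0)
                return -EINVAL;

        if (pos < 0) {
                if (!unsigned_offsets)
                        return -EINVAL;
                if (count >= (size_t)-pos)      /* both values in 0..LLONG_MAX */
                        return -EOVERFLOW;
        } else if ((off_t)(pos + count) < 0) {
                if (!unsigned_offsets)
                        return -EINVAL;
        }
        return 0;
}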
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e4a5fdd7ad7b..6d4a9beaa097 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,24 +99,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
- struct inode *inode = file_inode(file);
-
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0))
return -EINVAL;
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- int retval;
-
- retval = locks_mandatory_area(inode, file, pos, end,
- write ? F_WRLCK : F_RDLCK);
- if (retval < 0)
- return retval;
- }
-
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 855f0e87066d..2db8bcf7ff85 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio,
bytes_to_copy = min_t(int, bytes_to_copy,
req_length - copied_bytes);
- memcpy(actor_addr + actor_offset,
- page_address(bvec->bv_page) + bvec->bv_offset + offset,
+ memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
bytes_to_copy);
actor_offset += bytes_to_copy;
@@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
goto out_free_bio;
}
/* Extract the length of the metadata block */
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length = data[offset];
if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
@@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
res = -EIO;
goto out_free_bio;
}
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length |= data[0] << 8;
}
bio_free_pages(bio);
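
Each of the squashfs conversions above replaces the open-coded page_address(bvec->bv_page) + bvec->bv_offset with bvec_virt(). A self-contained model of what such a helper hides is sketched below; struct page_model, struct bvec_model and page_addr() are stand-ins for the kernel types, not the real API.

#include <stdint.h>

struct page_model { uint8_t data[4096]; };

struct bvec_model {
        struct page_model *bv_page;
        unsigned int bv_len;
        unsigned int bv_offset;
};

static inline void *page_addr(struct page_model *page) { return page->data; }

/* bvec_virt()-style helper: virtual address of the segment's data. */
static inline void *bvec_virt_model(struct bvec_model *bvec)
{
        return (uint8_t *)page_addr(bvec->bv_page) + bvec->bv_offset;
}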
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index 233d5582fbee..b685b6238316 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 97bb7d92ddcd..cb510a631968 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index e80419aed862..68f6d09bb3a2 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
stream->buf.in = data + offset;
stream->buf.in_size = avail;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index bcb881ec47f2..a20e9042146b 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
stream->next_in = data + offset;
stream->avail_in = avail;
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index b7cb1faa652d..0015cf8b5582 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
in_buf.src = data + offset;
in_buf.size = avail;
diff --git a/fs/super.c b/fs/super.c
index 91b7f156735b..bcef3a6f4c4b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
- s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
s->s_iflags |= SB_I_STABLE_WRITES;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c5509d2448e3..e9c96a0c79f1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -115,6 +115,22 @@ void timerfd_clock_was_set(void)
rcu_read_unlock();
}
+static void timerfd_resume_work(struct work_struct *work)
+{
+ timerfd_clock_was_set();
+}
+
+static DECLARE_WORK(timerfd_work, timerfd_resume_work);
+
+/*
+ * Invoked from timekeeping_resume(). Defer the actual update to work so
+ * timerfd_clock_was_set() runs in task context.
+ */
+void timerfd_resume(void)
+{
+ schedule_work(&timerfd_work);
+}
+
static void __timerfd_remove_cancel(struct timerfd_ctx *ctx)
{
if (ctx->might_cancel) {
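
The timerfd hunk above defers the clock-was-set handling to a workqueue so the heavy update runs in task context rather than from the resume path. A rough userspace analogue of that defer-to-worker pattern follows; the worker thread and its polling loop are stand-ins for the kernel workqueue, not how schedule_work() actually works.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool resume_pending;

static void clock_was_set(void)
{
        puts("rescan CLOCK_REALTIME timers");   /* heavy work, task context */
}

static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                if (atomic_exchange(&resume_pending, false))
                        clock_was_set();
                usleep(1000);
        }
        return NULL;
}

/* Resume hook: just flag the work, never do the heavy lifting here. */
static void timerfd_resume_model(void)
{
        atomic_store(&resume_pending, true);
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        timerfd_resume_model();
        sleep(1);
        return 0;
}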
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index c19dba45aa20..70abdfad2df1 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,7 +35,6 @@
#include "udf_i.h"
#include "udf_sb.h"
-
static int udf_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file_inode(file);
@@ -135,7 +134,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
lfi = cfi.lengthFileIdent;
if (fibh.sbh == fibh.ebh) {
- nameptr = fi->fileIdent + liu;
+ nameptr = udf_get_fi_ident(fi);
} else {
int poffset; /* Unpadded ending offset */

@@ -153,7 +152,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
}
}
nameptr = copy_name;
- memcpy(nameptr, fi->fileIdent + liu,
+ memcpy(nameptr, udf_get_fi_ident(fi),
lfi - poffset);
memcpy(nameptr + lfi - poffset,
fibh.ebh->b_data, poffset);
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 185c3e247648..de17a97e8667 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -307,14 +307,14 @@ struct logicalVolDesc {
struct regid impIdent;
uint8_t impUse[128];
struct extent_ad integritySeqExt;
- uint8_t partitionMaps[0];
+ uint8_t partitionMaps[];
} __packed;
/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
struct genericPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
- uint8_t partitionMapping[0];
+ uint8_t partitionMapping[];
} __packed;
/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */
@@ -342,7 +342,7 @@ struct unallocSpaceDesc {
struct tag descTag;
__le32 volDescSeqNum;
__le32 numAllocDescs;
- struct extent_ad allocDescs[0];
+ struct extent_ad allocDescs[];
} __packed;
/* Terminating Descriptor (ECMA 167r3 3/10.9) */
@@ -360,9 +360,9 @@ struct logicalVolIntegrityDesc {
uint8_t logicalVolContentsUse[32];
__le32 numOfPartitions;
__le32 lengthOfImpUse;
- __le32 freeSpaceTable[0];
- __le32 sizeTable[0];
- uint8_t impUse[0];
+ __le32 freeSpaceTable[];
+ /* __le32 sizeTable[]; */
+ /* uint8_t impUse[]; */
} __packed;
/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -471,9 +471,9 @@ struct fileIdentDesc {
uint8_t lengthFileIdent;
struct long_ad icb;
__le16 lengthOfImpUse;
- uint8_t impUse[0];
- uint8_t fileIdent[0];
- uint8_t padding[0];
+ uint8_t impUse[];
+ /* uint8_t fileIdent[]; */
+ /* uint8_t padding[]; */
} __packed;
/* File Characteristics (ECMA 167r3 4/14.4.3) */
@@ -578,8 +578,8 @@ struct fileEntry {
__le64 uniqueID;
__le32 lengthExtendedAttr;
__le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ uint8_t extendedAttr[];
+ /* uint8_t allocDescs[]; */
} __packed;
/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -632,7 +632,7 @@ struct genericFormat {
uint8_t attrSubtype;
uint8_t reserved[3];
__le32 attrLength;
- uint8_t attrData[0];
+ uint8_t attrData[];
} __packed;
/* Character Set Information (ECMA 167r3 4/14.10.3) */
@@ -643,7 +643,7 @@ struct charSetInfo {
__le32 attrLength;
__le32 escapeSeqLength;
uint8_t charSetType;
- uint8_t escapeSeq[0];
+ uint8_t escapeSeq[];
} __packed;
/* Alternate Permissions (ECMA 167r3 4/14.10.4) */
@@ -682,7 +682,7 @@ struct infoTimesExtAttr {
__le32 attrLength;
__le32 dataLength;
__le32 infoTimeExistence;
- uint8_t infoTimes[0];
+ uint8_t infoTimes[];
} __packed;
/* Device Specification (ECMA 167r3 4/14.10.7) */
@@ -694,7 +694,7 @@ struct deviceSpec {
__le32 impUseLength;
__le32 majorDeviceIdent;
__le32 minorDeviceIdent;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */
@@ -705,7 +705,7 @@ struct impUseExtAttr {
__le32 attrLength;
__le32 impUseLength;
struct regid impIdent;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */
@@ -716,7 +716,7 @@ struct appUseExtAttr {
__le32 attrLength;
__le32 appUseLength;
struct regid appIdent;
- uint8_t appUse[0];
+ uint8_t appUse[];
} __packed;
#define EXTATTR_CHAR_SET 1
@@ -733,7 +733,7 @@ struct unallocSpaceEntry {
struct tag descTag;
struct icbtag icbTag;
__le32 lengthAllocDescs;
- uint8_t allocDescs[0];
+ uint8_t allocDescs[];
} __packed;
/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
@@ -741,7 +741,7 @@ struct spaceBitmapDesc {
struct tag descTag;
__le32 numOfBits;
__le32 numOfBytes;
- uint8_t bitmap[0];
+ uint8_t bitmap[];
} __packed;
/* Partition Integrity Entry (ECMA 167r3 4/14.13) */
@@ -780,7 +780,7 @@ struct pathComponent {
uint8_t componentType;
uint8_t lengthComponentIdent;
__le16 componentFileVersionNum;
- dchars componentIdent[0];
+ dchars componentIdent[];
} __packed;
/* File Entry (ECMA 167r3 4/14.17) */
@@ -809,8 +809,8 @@ struct extendedFileEntry {
__le64 uniqueID;
__le32 lengthExtendedAttr;
__le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ uint8_t extendedAttr[];
+ /* uint8_t allocDescs[]; */
} __packed;
#endif /* _ECMA_167_H */
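
The header changes above convert the old GCC zero-length arrays ([0]) into C99 flexible array members ([]); because a struct may carry only one flexible array member, the additional trailing arrays (sizeTable, impUse, fileIdent, padding, allocDescs, vatEntry) are left as comments and reached via offset helpers such as udf_get_fi_ident() further down. A self-contained sketch of the allocation pattern a flexible array member enables, with struct blob as an illustrative type:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct blob {
        uint32_t len;
        uint8_t  data[];        /* flexible array member, formerly data[0] */
};

static struct blob *blob_new(const void *src, uint32_t len)
{
        /* One allocation covers the header plus the trailing payload. */
        struct blob *b = malloc(sizeof(*b) + len);

        if (!b)
                return NULL;
        b->len = len;
        memcpy(b->data, src, len);
        return b;
}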
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4917670860a0..1d6b7a50736b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -390,8 +390,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode,
dfibh.eoffset += (sfibh.eoffset - sfibh.soffset);
dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset);
if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse,
- sfi->fileIdent +
- le16_to_cpu(sfi->lengthOfImpUse))) {
+ udf_get_fi_ident(sfi))) {
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
brelse(dbh);
return NULL;
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index eab94527340d..1614d308d0f0 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
else
offset = le32_to_cpu(eahd->appAttrLocation);
- while (offset < iinfo->i_lenEAttr) {
+ while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) {
+ uint32_t attrLength;
+
gaf = (struct genericFormat *)&ea[offset];
+ attrLength = le32_to_cpu(gaf->attrLength);
+
+ /* Detect undersized elements and buffer overflows */
+ if ((attrLength < sizeof(*gaf)) ||
+ (attrLength > (iinfo->i_lenEAttr - offset)))
+ break;
+
if (le32_to_cpu(gaf->attrType) == type &&
gaf->attrSubtype == subtype)
return gaf;
else
- offset += le32_to_cpu(gaf->attrLength);
+ offset += attrLength;
}
}
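
udf_get_extendedattr() above now validates each attribute's attrLength before using it or stepping over it, so an undersized or oversized length can no longer walk past i_lenEAttr. A self-contained sketch of that bounds-checked walk over length-prefixed records follows; struct rec_hdr and find_rec() are illustrative, not the UDF on-disk layout.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct rec_hdr {
        uint32_t type;
        uint32_t length;        /* total record length, header included */
};

static const uint8_t *find_rec(const uint8_t *buf, size_t buflen, uint32_t type)
{
        size_t offset = 0;

        while (offset + sizeof(struct rec_hdr) < buflen) {
                struct rec_hdr hdr;

                memcpy(&hdr, buf + offset, sizeof(hdr));
                /* Reject undersized records and lengths past the buffer end. */
                if (hdr.length < sizeof(hdr) || hdr.length > buflen - offset)
                        break;
                if (hdr.type == type)
                        return buf + offset;
                offset += hdr.length;
        }
        return NULL;
}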
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 7c7c9bbbfa57..caeef08efed2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -74,12 +74,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi);
+ memcpy(udf_get_fi_ident(sfi), fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy((uint8_t *)sfi->fileIdent + liu, fileident,
- -offset);
+ memcpy(udf_get_fi_ident(sfi), fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -88,11 +87,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen);
+ memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset);
+ memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
@@ -226,7 +225,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
lfi = cfi->lengthFileIdent;
if (fibh->sbh == fibh->ebh) {
- nameptr = fi->fileIdent + liu;
+ nameptr = udf_get_fi_ident(fi);
} else {
int poffset; /* Unpadded ending offset */
@@ -246,7 +245,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
}
}
nameptr = copy_name;
- memcpy(nameptr, fi->fileIdent + liu,
+ memcpy(nameptr, udf_get_fi_ident(fi),
lfi - poffset);
memcpy(nameptr + lfi - poffset,
fibh->ebh->b_data, poffset);
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 22bc4fb2feb9..157de0ec0cd5 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -111,7 +111,7 @@ struct logicalVolIntegrityDescImpUse {
__le16 minUDFReadRev;
__le16 minUDFWriteRev;
__le16 maxUDFWriteRev;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */
@@ -178,15 +178,6 @@ struct metadataPartitionMap {
uint8_t reserved2[5];
} __packed;
-/* Virtual Allocation Table (UDF 1.5 2.2.10) */
-struct virtualAllocationTable15 {
- __le32 vatEntry[0];
- struct regid vatIdent;
- __le32 previousVATICBLoc;
-} __packed;
-
-#define ICBTAG_FILE_TYPE_VAT15 0x00U
-
/* Virtual Allocation Table (UDF 2.60 2.2.11) */
struct virtualAllocationTable20 {
__le16 lengthHeader;
@@ -199,8 +190,8 @@ struct virtualAllocationTable20 {
__le16 minUDFWriteRev;
__le16 maxUDFWriteRev;
__le16 reserved;
- uint8_t impUse[0];
- __le32 vatEntry[0];
+ uint8_t impUse[];
+ /* __le32 vatEntry[]; */
} __packed;
#define ICBTAG_FILE_TYPE_VAT20 0xF8U
@@ -217,8 +208,7 @@ struct sparingTable {
__le16 reallocationTableLen;
__le16 reserved;
__le32 sequenceNum;
- struct sparingEntry
- mapEntry[0];
+ struct sparingEntry mapEntry[];
} __packed;
/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */
@@ -241,7 +231,7 @@ struct allocDescImpUse {
/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */
struct freeEaSpace {
__le16 headerChecksum;
- uint8_t freeEASpace[0];
+ uint8_t freeEASpace[];
} __packed;
/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */
@@ -265,7 +255,7 @@ struct LVExtensionEA {
/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */
struct freeAppEASpace {
__le16 headerChecksum;
- uint8_t freeEASpace[0];
+ uint8_t freeEASpace[];
} __packed;
/* UDF Defined System Stream (UDF 2.60 3.3.7) */
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2f83c1204e20..b2d7c57d0688 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
return NULL;
lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
partnum = le32_to_cpu(lvid->numOfPartitions);
- if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
- offsetof(struct logicalVolIntegrityDesc, impUse)) /
- (2 * sizeof(uint32_t)) < partnum) {
- udf_err(sb, "Logical volume integrity descriptor corrupted "
- "(numOfPartitions = %u)!\n", partnum);
- return NULL;
- }
/* The offset is to skip freeSpaceTable and sizeTable arrays */
offset = partnum * 2 * sizeof(uint32_t);
- return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
+ return (struct logicalVolIntegrityDescImpUse *)
+ (((uint8_t *)(lvid + 1)) + offset);
}
/* UDF filesystem type */
@@ -349,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
if (sbi->s_anchor != 0)
seq_printf(seq, ",anchor=%u", sbi->s_anchor);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
- seq_puts(seq, ",utf8");
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+ if (sbi->s_nls_map)
seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+ else
+ seq_puts(seq, ",iocharset=utf8");
return 0;
}
@@ -558,19 +552,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
/* Ignored (never implemented properly) */
break;
case Opt_utf8:
- uopt->flags |= (1 << UDF_FLAG_UTF8);
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
break;
case Opt_iocharset:
if (!remount) {
- if (uopt->nls_map)
- unload_nls(uopt->nls_map);
- /*
- * load_nls() failure is handled later in
- * udf_fill_super() after all options are
- * parsed.
- */
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
+ /* When nls_map is not loaded then UTF-8 is used */
+ if (!remount && strcmp(args[0].from, "utf8") != 0) {
uopt->nls_map = load_nls(args[0].from);
- uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+ if (!uopt->nls_map) {
+ pr_err("iocharset %s not found\n",
+ args[0].from);
+ return 0;
+ }
}
break;
case Opt_uforget:
@@ -1542,6 +1541,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
int indirections = 0;
+ u32 parts, impuselen;
while (++indirections <= UDF_MAX_LVID_NESTING) {
final_bh = NULL;
@@ -1568,15 +1568,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
if (lvid->nextIntegrityExt.extLength == 0)
- return;
+ goto check;
loc = leea_to_cpu(lvid->nextIntegrityExt);
}
udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
UDF_MAX_LVID_NESTING);
+out_err:
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
+ return;
+check:
+ parts = le32_to_cpu(lvid->numOfPartitions);
+ impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+ if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+ sizeof(struct logicalVolIntegrityDesc) + impuselen +
+ 2 * parts * sizeof(u32) > sb->s_blocksize) {
+ udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+ "ignoring.\n", parts, impuselen);
+ goto out_err;
+ }
}
/*
@@ -2139,21 +2151,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!udf_parse_options((char *)options, &uopt, false))
goto parse_options_failure;
- if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
- uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
- udf_err(sb, "utf8 cannot be combined with iocharset\n");
- goto parse_options_failure;
- }
- if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
- uopt.nls_map = load_nls_default();
- if (!uopt.nls_map)
- uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP);
- else
- udf_debug("Using default NLS map\n");
- }
- if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
- uopt.flags |= (1 << UDF_FLAG_UTF8);
-
fileset.logicalBlockNum = 0xFFFFFFFF;
fileset.partitionReferenceNum = 0xFFFF;
@@ -2308,8 +2305,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
parse_options_failure:
- if (uopt.nls_map)
- unload_nls(uopt.nls_map);
+ unload_nls(uopt.nls_map);
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
@@ -2359,8 +2355,7 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
iput(sbi->s_vat_inode);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
- unload_nls(sbi->s_nls_map);
+ unload_nls(sbi->s_nls_map);
if (!sb_rdonly(sb))
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
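
udf_load_logicalvolint() above now rejects an LVID whose partition count or implementation-use length cannot fit in one block, which is what makes the simplified pointer arithmetic in udf_sb_lvidiu() safe. A self-contained sketch of that size check, with LVID_FIXED_SIZE standing in for sizeof(struct logicalVolIntegrityDesc):

#include <stdbool.h>
#include <stdint.h>

#define LVID_FIXED_SIZE 80u     /* stand-in for sizeof(struct logicalVolIntegrityDesc) */

static bool lvid_size_sane(uint32_t parts, uint32_t impuselen, uint32_t blocksize)
{
        if (parts >= blocksize || impuselen >= blocksize)
                return false;
        /* freeSpaceTable[] and sizeTable[] contribute 2 * parts 32-bit entries. */
        return (uint64_t)LVID_FIXED_SIZE + impuselen +
               2ull * parts * sizeof(uint32_t) <= blocksize;
}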
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 758efe557a19..4fa620543d30 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,8 +20,6 @@
#define UDF_FLAG_UNDELETE 6
#define UDF_FLAG_UNHIDE 7
#define UDF_FLAG_VARCONV 8
-#define UDF_FLAG_NLS_MAP 9
-#define UDF_FLAG_UTF8 10
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
#define UDF_FLAG_GID_FORGET 12
#define UDF_FLAG_UID_SET 13
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9dd0814f1077..7e258f15b8ef 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,6 +130,10 @@ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi)
le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent,
UDF_NAME_PAD);
}
+static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi)
+{
+ return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse);
+}
/* file.c */
extern long udf_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 5fcfa96463eb..622569007b53 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb,
return 0;
}
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->uni2char;
else
conv_f = NULL;
@@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb,
if (ocu_max_len <= 0)
return 0;
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->char2uni;
else
conv_f = NULL;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 213a97a921bb..1cd3f940fa6a 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1626,7 +1626,6 @@ xfs_swap_extents(
struct xfs_bstat *sbp = &sxp->sx_stat;
int src_log_flags, target_log_flags;
int error = 0;
- int lock_flags;
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
@@ -1638,8 +1637,8 @@ xfs_swap_extents(
* do the rest of the checks.
*/
lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
- lock_flags = XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
/* Verify that both files have the same format */
if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
@@ -1711,7 +1710,6 @@ xfs_swap_extents(
* or cancel will unlock the inodes from this point onwards.
*/
xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
- lock_flags |= XFS_ILOCK_EXCL;
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_ijoin(tp, tip, 0);
@@ -1830,13 +1828,16 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
+out_unlock_ilock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
+ filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
- goto out_unlock;
+ goto out_unlock_ilock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ff42b3585e0..3ab73567a0f5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -844,7 +844,7 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_bdi))
+ if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
return;
xfs_buf_read_map(target, map, nmaps,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index cc3cfb12df53..3dfbdcdb0d1c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1302,7 +1302,7 @@ xfs_file_llseek(
*
* mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmaplock (XFS - truncate serialisation)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1323,24 +1323,27 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
pfn_t pfn;
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_direct_write_iomap_ops :
&xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
- if (write_fault)
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
- else
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
ret = filemap_fault(vmf);
+ }
}
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (write_fault)
sb_end_pagefault(inode->i_sb);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 990b72ae3635..f00145e1a976 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared(
/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared(
*
* Basic locking order:
*
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
*
* mmap_lock locking order:
*
* i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
*
* The difference in mmap_lock locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
*/
@@ -188,10 +188,13 @@ xfs_ilock(
XFS_IOLOCK_DEP(lock_flags));
}
- if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_ILOCK_EXCL)
mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_MMAPLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_mmaplock))
+ if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_mmaplock))
+ if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
}
@@ -258,9 +261,9 @@ xfs_ilock_nowait(
out_undo_mmaplock:
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -307,9 +310,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
mrunlock_excl(&ip->i_lock);
@@ -335,7 +338,7 @@ xfs_ilock_demote(
if (lock_flags & XFS_ILOCK_EXCL)
mrdemote(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrdemote(&ip->i_mmaplock);
+ downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_rwsem);
@@ -343,9 +346,29 @@ xfs_ilock_demote(
}
#if defined(DEBUG) || defined(XFS_WARN)
-int
+static inline bool
+__xfs_rwsem_islocked(
+ struct rw_semaphore *rwsem,
+ bool shared)
+{
+ if (!debug_locks)
+ return rwsem_is_locked(rwsem);
+
+ if (!shared)
+ return lockdep_is_held_type(rwsem, 0);
+
+ /*
+ * We are checking that the lock is held at least in shared
+ * mode but don't care that it might be held exclusively
+ * (i.e. shared | excl). Hence we check if the lock is held
+ * in any mode rather than an explicit shared mode.
+ */
+ return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
xfs_isilocked(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
uint lock_flags)
{
if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
@@ -355,20 +378,17 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- if (!(lock_flags & XFS_MMAPLOCK_SHARED))
- return !!ip->i_mmaplock.mr_writer;
- return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !debug_locks ||
- lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
- return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+ if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
}
ASSERT(0);
- return 0;
+ return false;
}
#endif
@@ -532,12 +552,10 @@ again:
}
/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders. The iolock must be double-locked separately since
- * we use i_rwsem for that. We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
*/
void
xfs_lock_two_inodes(
@@ -555,15 +573,8 @@ xfs_lock_two_inodes(
ASSERT(hweight32(ip1_mode) == 1);
ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+ ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
@@ -3741,11 +3752,8 @@ xfs_ilock2_io_mmap(
ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
if (ret)
return ret;
- if (ip1 == ip2)
- xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
- ip2, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
return 0;
}
@@ -3755,12 +3763,9 @@ xfs_iunlock2_io_mmap(
struct xfs_inode *ip1,
struct xfs_inode *ip2)
{
- bool same_inode = (ip1 == ip2);
-
- xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
inode_unlock(VFS_I(ip2));
- if (!same_inode)
+ if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
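
The conversion above drops the XFS-private i_mmaplock and routes XFS_MMAPLOCK_* through the VFS mapping->invalidate_lock, so page faults and invalidation paths (truncate, hole punch, extent swap; zonefs below makes the same switch) now share one rw_semaphore. A rough userspace model of that reader/writer discipline, with a pthread rwlock as a stand-in for the kernel rwsem:

#include <pthread.h>

static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Invalidation side, e.g. truncate or hole punch: exclusive. */
static void invalidate_pagecache_model(void)
{
        pthread_rwlock_wrlock(&invalidate_lock);  /* filemap_invalidate_lock() */
        /* ... remove pages, then change the extent map / i_size ... */
        pthread_rwlock_unlock(&invalidate_lock);  /* filemap_invalidate_unlock() */
}

/* Fault side, e.g. page_mkwrite or a DAX fault: shared. */
static void fault_model(void)
{
        pthread_rwlock_rdlock(&invalidate_lock);  /* filemap_invalidate_lock_shared() */
        /* ... look up the extent and instantiate the page ... */
        pthread_rwlock_unlock(&invalidate_lock);
}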
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4b6703dbffb8..e0ae905554e2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
- mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
/*
@@ -410,7 +409,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-int xfs_isilocked(xfs_inode_t *, uint);
+bool xfs_isilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2c9e26a44546..102cbd606633 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -709,8 +709,6 @@ xfs_fs_inode_init_once(
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 70055d486bf7..ddc346a9df9b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
inode_dio_wait(inode);
/* Serialize against page faults */
- down_write(&zi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Serialize against zonefs_iomap_begin() */
mutex_lock(&zi->i_truncate_mutex);
@@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
unlock:
mutex_unlock(&zi->i_truncate_mutex);
- up_write(&zi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
@@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
return ret;
}
-static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
- vm_fault_t ret;
-
- down_read(&zi->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&zi->i_mmap_sem);
-
- return ret;
-}
-
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
file_update_time(vmf->vma->vm_file);
/* Serialize against truncates */
- down_read(&zi->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
- up_read(&zi->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
return ret;
}
static const struct vm_operations_struct zonefs_file_vm_ops = {
- .fault = zonefs_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = zonefs_filemap_page_mkwrite,
};
@@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
- init_rwsem(&zi->i_mmap_sem);
zi->i_wr_refcnt = 0;
return &zi->i_vnode;
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 51141907097c..7b147907c328 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -70,12 +70,11 @@ struct zonefs_inode_info {
* and changes to the inode private data, and in particular changes to
* a sequential file size on completion of direct IO writes.
* Serialization of mmap read IOs with truncate and syscall IO
- * operations is done with i_mmap_sem in addition to i_truncate_mutex.
- * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first,
- * i_truncate_mutex second).
+ * operations is done with invalidate_lock in addition to
+ * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock
+ * (invalidate_lock first, i_truncate_mutex second).
*/
struct mutex i_truncate_mutex;
- struct rw_semaphore i_mmap_sem;
/* guarded by i_truncate_mutex */
unsigned int i_wr_refcnt;