Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/cell.c | 9
-rw-r--r--  fs/afs/dir.c | 2
-rw-r--r--  fs/afs/fs_operation.c | 4
-rw-r--r--  fs/afs/fs_probe.c | 11
-rw-r--r--  fs/afs/internal.h | 3
-rw-r--r--  fs/afs/main.c | 3
-rw-r--r--  fs/afs/server.c | 3
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/autofs/waitq.c | 2
-rw-r--r--  fs/block_dev.c | 17
-rw-r--r--  fs/btrfs/backref.c | 1
-rw-r--r--  fs/btrfs/block-group.c | 44
-rw-r--r--  fs/btrfs/ctree.c | 2
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/discard.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 6
-rw-r--r--  fs/btrfs/extent_io.c | 43
-rw-r--r--  fs/btrfs/file.c | 47
-rw-r--r--  fs/btrfs/inode.c | 71
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/btrfs/ref-verify.c | 2
-rw-r--r--  fs/btrfs/space-info.c | 2
-rw-r--r--  fs/btrfs/super.c | 6
-rw-r--r--  fs/btrfs/tree-log.c | 5
-rw-r--r--  fs/btrfs/volumes.c | 8
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/cachefiles/rdwr.c | 2
-rw-r--r--  fs/cifs/cifs_debug.c | 6
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/connect.c | 10
-rw-r--r--  fs/cifs/file.c | 30
-rw-r--r--  fs/cifs/inode.c | 9
-rw-r--r--  fs/cifs/ioctl.c | 9
-rw-r--r--  fs/cifs/misc.c | 16
-rw-r--r--  fs/cifs/smb2misc.c | 8
-rw-r--r--  fs/cifs/smb2ops.c | 14
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/efivarfs/file.c | 7
-rw-r--r--  fs/efivarfs/super.c | 6
-rw-r--r--  fs/erofs/zdata.h | 20
-rw-r--r--  fs/exfat/dir.c | 14
-rw-r--r--  fs/exfat/exfat_fs.h | 3
-rw-r--r--  fs/exfat/file.c | 21
-rw-r--r--  fs/exfat/namei.c | 14
-rw-r--r--  fs/exfat/nls.c | 8
-rw-r--r--  fs/exfat/super.c | 10
-rw-r--r--  fs/fuse/file.c | 132
-rw-r--r--  fs/fuse/inode.c | 19
-rw-r--r--  fs/gfs2/aops.c | 45
-rw-r--r--  fs/gfs2/file.c | 52
-rw-r--r--  fs/gfs2/glock.c | 5
-rw-r--r--  fs/gfs2/glops.c | 10
-rw-r--r--  fs/gfs2/incore.h | 1
-rw-r--r--  fs/gfs2/inode.c | 3
-rw-r--r--  fs/gfs2/log.c | 25
-rw-r--r--  fs/gfs2/log.h | 4
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 13
-rw-r--r--  fs/gfs2/recovery.c | 4
-rw-r--r--  fs/gfs2/super.c | 20
-rw-r--r--  fs/io-wq.c | 108
-rw-r--r--  fs/io-wq.h | 4
-rw-r--r--  fs/io_uring.c | 339
-rw-r--r--  fs/namespace.c | 1
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 11
-rw-r--r--  fs/nfs/nfs4namespace.c | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 28
-rw-r--r--  fs/nfsd/nfsctl.c | 23
-rw-r--r--  fs/nfsd/nfsd.h | 3
-rw-r--r--  fs/nfsd/vfs.c | 6
-rw-r--r--  fs/ocfs2/dlmglue.c | 17
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 9
-rw-r--r--  fs/overlayfs/copy_up.c | 2
-rw-r--r--  fs/overlayfs/export.c | 2
-rw-r--r--  fs/overlayfs/file.c | 10
-rw-r--r--  fs/overlayfs/namei.c | 15
-rw-r--r--  fs/overlayfs/overlayfs.h | 1
-rw-r--r--  fs/overlayfs/super.c | 73
-rw-r--r--  fs/proc/bootconfig.c | 15
-rw-r--r--  fs/proc/kcore.c | 3
-rw-r--r--  fs/proc/proc_sysctl.c | 6
-rw-r--r--  fs/read_write.c | 131
-rw-r--r--  fs/squashfs/block.c | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 10
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/zonefs/super.c | 18
89 files changed, 1109 insertions, 600 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 005921e3b38d..5b79cdceefa0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -154,10 +154,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
+ cell->name = kmalloc(namelen + 1, GFP_KERNEL);
+ if (!cell->name) {
+ kfree(cell);
+ return ERR_PTR(-ENOMEM);
+ }
+
cell->net = net;
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
+ cell->name[i] = 0;
atomic_set(&cell->usage, 2);
INIT_WORK(&cell->manager, afs_manage_cell);
@@ -207,6 +214,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ kfree(cell->name);
kfree(cell);
_leave(" = %d", ret);
return ERR_PTR(ret);
@@ -489,6 +497,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
afs_put_cell(cell->net, cell->alias_of);
key_put(cell->anonymous_key);
+ kfree(cell->name);
kfree(cell);
_leave(" [destroyed]");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 3e3c2bf0a722..96757f3abd74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* to FS.FetchStatus for op->file[1].
*/
op->fetch_status.which = 1;
- op->ops = &afs_fetch_status_operation;
+ op->ops = &afs_lookup_fetch_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index c264839b2fd0..24fd163c6323 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- op->error = -EINTR;
+ op->error = -ERESTARTSYS;
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
return false;
@@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
- op->error = -EINTR;
+ op->error = -ERESTARTSYS;
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
op->flags &= ~AFS_OPERATION_LOCK_0;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b34f74b0f319..5d9ef517cf81 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
{
struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
- if (!queue_work(afs_wq, &net->fs_prober))
+ if (!net->live || !queue_work(afs_wq, &net->fs_prober))
afs_dec_servers_outstanding(net);
}
@@ -458,3 +458,12 @@ dont_wait:
return -ETIME;
return -EDESTADDRREQ;
}
+
+/*
+ * Clean up the probing when the namespace is killed off.
+ */
+void afs_fs_probe_cleanup(struct afs_net *net)
+{
+ if (del_timer_sync(&net->fs_probe_timer))
+ afs_dec_servers_outstanding(net);
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 573a5922c3bb..792ac711985e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -388,7 +388,7 @@ struct afs_cell {
struct afs_vlserver_list __rcu *vl_servers;
u8 name_len; /* Length of name */
- char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */
+ char *name; /* Cell name, case-flattened and NUL-padded */
};
/*
@@ -1065,6 +1065,7 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+extern void afs_fs_probe_cleanup(struct afs_net *);
/*
* inode.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9c79c91e8005..31b472f7c734 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns)
timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
+ atomic_set(&net->servers_outstanding, 1);
ret = -ENOMEM;
sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
@@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns)
error_open_socket:
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
error_cell_init:
@@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
struct afs_net *net = afs_net(net_ns);
net->live = false;
+ afs_fs_probe_cleanup(net);
afs_cell_purge(net);
afs_purge_servers(net);
afs_close_socket(net);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 039e3488511c..e82e452e2612 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net)
_enter("");
if (del_timer_sync(&net->fs_timer))
- atomic_dec(&net->servers_outstanding);
+ afs_dec_servers_outstanding(net);
afs_queue_server_manager(net);
_debug("wait");
+ atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
_leave("");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7437806332d9..a121c247d95a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -449,6 +449,7 @@ static int afs_store_data(struct address_space *mapping,
op->store.first_offset = offset;
op->store.last_to = to;
op->mtime = vnode->vfs_inode.i_mtime;
+ op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
try_next_key:
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index b04c528b19d3..74c886f7c51c 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = __kernel_write(file, data, bytes, &file->f_pos);
+ wr = kernel_write(file, data, bytes, &file->f_pos);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e589388..0ae656e022fd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev)
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
@@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev)
invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
@@ -1565,10 +1564,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
if (!for_part) {
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0) {
- bdput(bdev);
+ if (ret != 0)
return ret;
- }
}
restart:
@@ -1637,8 +1634,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, mode, 1);
- if (ret)
+ if (ret) {
+ bdput(whole);
goto out_clear;
+ }
bdev->bd_contains = whole;
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
@@ -1688,7 +1687,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_unblock_events(disk);
put_disk_and_module(disk);
out:
- bdput(bdev);
return ret;
}
@@ -1755,6 +1753,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
bdput(whole);
}
+ if (res)
+ bdput(bdev);
+
return res;
}
EXPORT_SYMBOL(blkdev_get);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d888e71e66b6..ea10f7bc99ab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1461,6 +1461,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 176e8a292fd1..c037ef514b64 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out_put_group;
+ goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out_put_group;
+ goto out;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out_put_group;
+ goto out;
btrfs_release_path(path);
}
@@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1089,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->space_info->lock);
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
block_group->removed = 1;
@@ -1123,17 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->chunk_mutex);
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
-
- ret = remove_block_group_item(trans, path, block_group);
- if (ret < 0)
- goto out;
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1145,10 +1156,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_extent_map(em);
}
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3a7648bff42c..82ab6e5a386d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1196,7 +1196,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- /* Fallthrough */
+ fallthrough;
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 30ce7039bc27..d404cce8ae40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1009,6 +1009,8 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used only for subvolume roots. */
+ BTRFS_ROOT_HAS_LOG_TREE,
};
/*
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 5615320fa659..741c7e19c32f 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -619,6 +619,7 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
bg_list) {
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
}
spin_unlock(&fs_info->unused_bgs_lock);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7c6f0bbb54a5..b1a148058773 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2593,10 +2593,12 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
!extent_buffer_uptodate(tree_root->node)) {
handle_error = true;
- if (IS_ERR(tree_root->node))
+ if (IS_ERR(tree_root->node)) {
ret = PTR_ERR(tree_root->node);
- else if (!extent_buffer_uptodate(tree_root->node))
+ tree_root->node = NULL;
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
ret = -EUCLEAN;
+ }
btrfs_warn(fs_info, "failed to read tree root");
continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68c96057ad2d..60278e52c37a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1999,7 +1999,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -5058,25 +5059,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5527,6 +5531,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2c14312b05e8..b0d2c976587e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
+ num_bytes = lockend - lockstart + 1;
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
+ if (nowait) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ return -EAGAIN;
+
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ num_bytes);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
+ lockend, NULL);
+ }
- num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ if (!nowait)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-
+out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
@@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
+ &write_bytes, false) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1904,13 +1920,25 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
/*
* We will allocate space in case nodatacow is not set,
* so bail
*/
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
+ true) <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count) {
inode_unlock(inode);
return -EAGAIN;
}
@@ -3481,6 +3509,7 @@ const struct file_operations btrfs_file_operations = {
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d04c82c88418..6862cd7e21a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode,
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1361,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 range_start = start;
@@ -1391,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
* data space info, which we incremented in the step above.
*
* If we need to fallback to cow and the inode corresponds to a free
- * space cache inode, we must also increment bytes_may_use of the data
- * space_info for the same reason. Space caches always get a prealloc
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
* extent for them, however scrub or balance may have set the block
- * group that contains that extent to RO mode.
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0);
- if (count > 0 || is_space_ino) {
- const u64 bytes = is_space_ino ? range_bytes : count;
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
spin_lock(&sinfo->lock);
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
spin_unlock(&sinfo->lock);
@@ -1666,12 +1690,8 @@ out_check:
ret = fallback_to_cow(inode, locked_page, cow_start,
found_key.offset - 1,
page_started, nr_written);
- if (ret) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
+ if (ret)
goto error;
- }
cow_start = (u64)-1;
}
@@ -1687,9 +1707,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
ret = PTR_ERR(em);
goto error;
}
@@ -7865,9 +7882,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
offset, count);
@@ -8109,20 +8123,17 @@ again:
/*
* Qgroup reserved space handler
* Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
+ * 1) Already written to disk or ordered extent already submitted
+ * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
+ * bit of its io_tree, and free the qgroup reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 168deb8ef68a..e8f7c5f00894 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2692,7 +2692,7 @@ out:
btrfs_put_root(root);
out_free:
btrfs_free_path(path);
- kzfree(subvol_info);
+ kfree(subvol_info);
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 7887317033c9..af92525dbb16 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -509,7 +509,7 @@ static int process_leaf(struct btrfs_root *root,
switch (key.type) {
case BTRFS_EXTENT_ITEM_KEY:
*num_bytes = key.offset;
- /* fall through */
+ fallthrough;
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 41ee88633769..c7bd3fdd7792 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -879,8 +879,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
}
global_rsv->reserved -= ticket->bytes;
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
- list_del_init(&ticket->list);
wake_up(&ticket->wait);
space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bc73fd670702..c3826ae883f0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -523,7 +523,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
- /* Fallthrough */
+ fallthrough;
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
@@ -622,7 +622,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
- /* Fallthrough */
+ fallthrough;
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
@@ -793,7 +793,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_recovery:
btrfs_warn(info,
"'recovery' is deprecated, use 'usebackuproot' instead");
- /* fall through */
+ fallthrough;
case Opt_usebackuproot:
btrfs_info(info,
"trying to use backup root at mount time");
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 920cee312f4e..cd5348f352dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
@@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root)
{
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..f403fb1e6d37 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7052,6 +7052,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_info->chunk_mutex);
/*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
+
+ /*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f067b5934c46..75af2334b2e3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -408,7 +408,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
- /* fall through */
+ fallthrough;
case REQ_OP_READ:
return BTRFS_MAP_READ;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e7726f5f1241..3080cda9e824 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -937,7 +937,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
}
data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
+ ret = kernel_write(file, data, len, &pos);
kunmap(page);
fput(file);
if (ret != len)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index fc98b97b396a..53588d7517b4 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -399,6 +399,10 @@ skip_rdma:
if (ses->sign)
seq_puts(m, " signed");
+ seq_printf(m, "\n\tUser: %d Cred User: %d",
+ from_kuid(&init_user_ns, ses->linux_uid),
+ from_kuid(&init_user_ns, ses->cred_uid));
+
if (ses->chan_count > 1) {
seq_printf(m, "\n\n\tExtra Channels: %zu\n",
ses->chan_count-1);
@@ -406,7 +410,7 @@ skip_rdma:
cifs_dump_channel(m, j, &ses->chans[j]);
}
- seq_puts(m, "\n\tShares:");
+ seq_puts(m, "\n\n\tShares:");
j = 0;
seq_printf(m, "\n\t%d) IPC: ", j);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c7a311d28d3d..99b3180c613a 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.27"
+#define CIFS_VERSION "2.28"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5fac34f192af..a61abde09ffe 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -5306,9 +5306,15 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->nohandlecache = master_tcon->nohandlecache;
vol_info->local_lease = master_tcon->local_lease;
+ vol_info->no_lease = master_tcon->no_lease;
+ vol_info->resilient = master_tcon->use_resilient;
+ vol_info->persistent = master_tcon->use_persistent;
+ vol_info->handle_timeout = master_tcon->handle_timeout;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->linux_ext = master_tcon->posix_extensions;
vol_info->sectype = master_tcon->ses->sectype;
vol_info->sign = master_tcon->ses->sign;
+ vol_info->seal = master_tcon->seal;
rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
if (rc) {
@@ -5334,10 +5340,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
-
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4fe757cfc360..be46fab4c96d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1149,20 +1149,20 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
/*
* Set the byte-range lock (posix style). Returns:
- * 1) 0, if we set the lock and don't need to request to the server;
- * 2) 1, if we need to request to the server;
- * 3) <0, if the error occurs while setting the lock.
+ * 1) <0, if the error occurs while setting the lock;
+ * 2) 0, if we set the lock and don't need to request to the server;
+ * 3) FILE_LOCK_DEFERRED, if we will wait for some other file_lock;
+ * 4) FILE_LOCK_DEFERRED + 1, if we need to request to the server.
*/
static int
cifs_posix_lock_set(struct file *file, struct file_lock *flock)
{
struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
- int rc = 1;
+ int rc = FILE_LOCK_DEFERRED + 1;
if ((flock->fl_flags & FL_POSIX) == 0)
return rc;
-try_again:
cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
@@ -1171,13 +1171,6 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
- if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait,
- list_empty(&flock->fl_blocked_member));
- if (!rc)
- goto try_again;
- locks_delete_block(flock);
- }
return rc;
}
@@ -1652,7 +1645,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
int posix_lock_type;
rc = cifs_posix_lock_set(file, flock);
- if (!rc || rc < 0)
+ if (rc <= FILE_LOCK_DEFERRED)
return rc;
if (type & server->vals->shared_lock_type)
@@ -4336,7 +4329,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
__SetPageLocked(page);
- if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
+ rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
+ if (rc) {
__ClearPageLocked(page);
break;
}
@@ -4352,6 +4346,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
int rc;
+ int err = 0;
struct list_head tmplist;
struct cifsFileInfo *open_file = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
@@ -4396,7 +4391,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* the order of declining indexes. When we put the pages in
* the rdata->pages, then we want them in increasing order.
*/
- while (!list_empty(page_list)) {
+ while (!list_empty(page_list) && !err) {
unsigned int i, nr_pages, bytes, rsize;
loff_t offset;
struct page *page, *tpage;
@@ -4429,9 +4424,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
return 0;
}
- rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
+ nr_pages = 0;
+ err = readpages_get_pages(mapping, page_list, rsize, &tmplist,
&nr_pages, &offset, &bytes);
- if (rc) {
+ if (!nr_pages) {
add_credits_and_wake_if(server, credits, 0);
break;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 583f5e4008c2..ce95801e9b66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2535,6 +2535,15 @@ set_size_out:
if (rc == 0) {
cifsInode->server_eof = attrs->ia_size;
cifs_setsize(inode, attrs->ia_size);
+
+ /*
+ * The man page of truncate says if the size changed,
+ * then the st_ctime and st_mtime fields for the file
+ * are updated.
+ */
+ attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
+ attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+
cifs_truncate_page(inode->i_mapping, inode->i_size);
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 4a73e63c4d43..dcde44ff6cf9 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -169,6 +169,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
unsigned int xid;
struct cifsFileInfo *pSMBFile = filep->private_data;
struct cifs_tcon *tcon;
+ struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb;
__u64 ExtAttrBits = 0;
__u64 caps;
@@ -307,13 +308,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
break;
}
cifs_sb = CIFS_SB(inode->i_sb);
- tcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
if (tcon && tcon->ses->server->ops->notify) {
rc = tcon->ses->server->ops->notify(xid,
filep, (void __user *)arg);
cifs_dbg(FYI, "ioctl notify rc %d\n", rc);
} else
rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 56791a692c8b..e44d049142d0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -844,28 +844,26 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
struct bio_vec *bv = NULL;
if (iov_iter_is_kvec(iter)) {
- memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+ memcpy(&ctx->iter, iter, sizeof(*iter));
ctx->len = count;
iov_iter_advance(iter, count);
return 0;
}
- if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
- bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
+ if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT)
+ bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL);
if (!bv) {
- bv = vmalloc(array_size(max_pages, sizeof(struct bio_vec)));
+ bv = vmalloc(array_size(max_pages, sizeof(*bv)));
if (!bv)
return -ENOMEM;
}
- if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
- pages = kmalloc_array(max_pages, sizeof(struct page *),
- GFP_KERNEL);
+ if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT)
+ pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL);
if (!pages) {
- pages = vmalloc(array_size(max_pages, sizeof(struct page *)));
+ pages = vmalloc(array_size(max_pages, sizeof(*pages)));
if (!pages) {
kvfree(bv);
return -ENOMEM;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a39451973f8..157992864ce7 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -354,9 +354,13 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
((struct smb2_ioctl_rsp *)shdr)->OutputCount);
break;
case SMB2_CHANGE_NOTIFY:
+ *off = le16_to_cpu(
+ ((struct smb2_change_notify_rsp *)shdr)->OutputBufferOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_change_notify_rsp *)shdr)->OutputBufferLength);
+ break;
default:
- /* BB FIXME for unimplemented cases above */
- cifs_dbg(VFS, "no length check for command\n");
+ cifs_dbg(VFS, "no length check for command %d\n", le16_to_cpu(shdr->Command));
break;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 736d86b8a910..32f90dc82c84 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -763,6 +763,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
/* close extra handle outside of crit sec */
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
}
+ rc = 0;
goto oshr_free;
}
@@ -2147,7 +2148,7 @@ smb3_notify(const unsigned int xid, struct file *pfile,
tcon = cifs_sb_master_tcon(cifs_sb);
oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
oparms.disposition = FILE_OPEN;
oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
@@ -3187,6 +3188,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
ses->Suid, offset, len);
+ /*
+ * We zero the range through ioctl, so we need remove the page caches
+ * first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
@@ -3253,6 +3259,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+ /*
+ * We implement the punch hole through ioctl, so we need remove the page
+ * caches first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d11e31064679..84433d0653f9 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -523,7 +523,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
const int timeout, const int flags,
unsigned int *instance)
{
- int rc;
+ long rc;
int *credits;
int optype;
long int t;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index e9e27a271af0..feaa5e182b7b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
+ inode->i_mtime = current_time(inode);
inode_unlock(inode);
}
@@ -72,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
ssize_t size = 0;
int err;
- while (!__ratelimit(&file->f_cred->user->ratelimit)) {
- if (!msleep_interruptible(50))
- return -EINTR;
- }
+ while (!__ratelimit(&file->f_cred->user->ratelimit))
+ msleep(50);
err = efivar_entry_size(var, &datasize);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 12c66f5d92dd..28bb5689333a 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -201,6 +201,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
+ if (!efivar_supports_writes())
+ sb->s_flags |= SB_RDONLY;
+
inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
@@ -252,9 +255,6 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
- if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
- return -ENODEV;
-
if (!efivars_kobject())
return -ENODEV;
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 7824f5563a55..9b66c28b3ae9 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page)
static inline void z_erofs_onlinepage_fixup(struct page *page,
uintptr_t index, bool down)
{
- unsigned long *p, o, v, id;
-repeat:
- p = &page_private(page);
- o = READ_ONCE(*p);
+ union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+ int orig, orig_index, val;
- id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (id) {
+repeat:
+ orig = atomic_read(u.o);
+ orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+ if (orig_index) {
if (!index)
return;
- DBG_BUGON(id != index);
+ DBG_BUGON(orig_index != index);
}
- v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (cmpxchg(p, o, v) != o)
+ val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+ ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+ if (atomic_cmpxchg(u.o, orig, val) != orig)
goto repeat;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index de43534aa299..119abf0d8dd6 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -309,7 +309,7 @@ const struct file_operations exfat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate = exfat_iterate,
- .fsync = generic_file_fsync,
+ .fsync = exfat_file_fsync,
};
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu)
@@ -425,10 +425,12 @@ static void exfat_init_name_entry(struct exfat_dentry *ep,
ep->dentry.name.flags = 0x0;
for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) {
- ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname);
- if (*uniname == 0x0)
- break;
- uniname++;
+ if (*uniname != 0x0) {
+ ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname);
+ uniname++;
+ } else {
+ ep->dentry.name.unicode_0_14[i] = 0x0;
+ }
}
}
@@ -1110,7 +1112,7 @@ found:
ret = exfat_get_next_cluster(sb, &clu.dir);
}
- if (ret || clu.dir != EXFAT_EOF_CLUSTER) {
+ if (ret || clu.dir == EXFAT_EOF_CLUSTER) {
/* just initialized hint_stat */
hint_stat->clu = p_dir->dir;
hint_stat->eidx = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 595f3117f492..75c7bdbeba6d 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -371,7 +371,7 @@ static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi,
static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
unsigned int clus)
{
- return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) +
+ return ((sector_t)(clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) +
sbi->data_start_sector;
}
@@ -420,6 +420,7 @@ void exfat_truncate(struct inode *inode, loff_t size);
int exfat_setattr(struct dentry *dentry, struct iattr *attr);
int exfat_getattr(const struct path *path, struct kstat *stat,
unsigned int request_mask, unsigned int query_flags);
+int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index fce03f318787..a6a063830edc 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -175,7 +176,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ep2->dentry.stream.size = 0;
} else {
ep2->dentry.stream.valid_size = cpu_to_le64(new_size);
- ep2->dentry.stream.size = ep->dentry.stream.valid_size;
+ ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
}
if (new_size == 0) {
@@ -346,12 +347,28 @@ out:
return error;
}
+int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int err;
+
+ err = __generic_file_fsync(filp, start, end, datasync);
+ if (err)
+ return err;
+
+ err = sync_blockdev(inode->i_sb->s_bdev);
+ if (err)
+ return err;
+
+ return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
+ .fsync = exfat_file_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 5b0f35329d63..2b9e21094a96 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -975,7 +975,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
- exfat_set_vol_flags(sb, VOL_DIRTY);
exfat_chain_set(&clu_to_free, ei->start_clu,
EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags);
@@ -1002,6 +1001,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
num_entries++;
brelse(bh);
+ exfat_set_vol_flags(sb, VOL_DIRTY);
err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
if (err) {
exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
@@ -1077,10 +1077,14 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh,
&sector_old);
+ if (!epold)
+ return -EIO;
epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh,
&sector_new);
- if (!epold || !epnew)
+ if (!epnew) {
+ brelse(old_bh);
return -EIO;
+ }
memcpy(epnew, epold, DENTRY_SIZE);
exfat_update_bh(sb, new_bh, sync);
@@ -1161,10 +1165,14 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh,
&sector_mov);
+ if (!epmov)
+ return -EIO;
epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh,
&sector_new);
- if (!epmov || !epnew)
+ if (!epnew) {
+ brelse(mov_bh);
return -EIO;
+ }
memcpy(epnew, epmov, DENTRY_SIZE);
exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode));
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 57b5a7a4d1f7..a3c927501e67 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -495,7 +495,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
struct exfat_uni_name *p_uniname, int *p_lossy)
{
int i, unilen, lossy = NLS_NAME_NO_LOSSY;
- unsigned short upname[MAX_NAME_LENGTH + 1];
+ __le16 upname[MAX_NAME_LENGTH + 1];
unsigned short *uniname = p_uniname->name;
WARN_ON(!len);
@@ -519,7 +519,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
exfat_wstrchr(bad_uni_chars, *uniname))
lossy |= NLS_NAME_LOSSY;
- upname[i] = exfat_toupper(sb, *uniname);
+ upname[i] = cpu_to_le16(exfat_toupper(sb, *uniname));
uniname++;
}
@@ -597,7 +597,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
struct exfat_uni_name *p_uniname, int *p_lossy)
{
int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY;
- unsigned short upname[MAX_NAME_LENGTH + 1];
+ __le16 upname[MAX_NAME_LENGTH + 1];
unsigned short *uniname = p_uniname->name;
struct nls_table *nls = EXFAT_SB(sb)->nls_io;
@@ -611,7 +611,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
exfat_wstrchr(bad_uni_chars, *uniname))
lossy |= NLS_NAME_LOSSY;
- upname[unilen] = exfat_toupper(sb, *uniname);
+ upname[unilen] = cpu_to_le16(exfat_toupper(sb, *uniname));
uniname++;
unilen++;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index e650e65536f8..253a92460d52 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -693,10 +693,20 @@ static void exfat_free(struct fs_context *fc)
}
}
+static int exfat_reconfigure(struct fs_context *fc)
+{
+ fc->sb_flags |= SB_NODIRATIME;
+
+ /* volume flag will be updated in exfat_sync_fs */
+ sync_filesystem(fc->root->d_sb);
+ return 0;
+}
+
static const struct fs_context_operations exfat_context_ops = {
.parse_param = exfat_parse_param,
.get_tree = exfat_get_tree,
.free = exfat_free,
+ .reconfigure = exfat_reconfigure,
};
static int exfat_init_fs_context(struct fs_context *fc)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e573b0cd2737..83d917f7e542 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -18,6 +18,7 @@
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
+#include <linux/fs.h>
static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
struct fuse_page_desc **desc)
@@ -1586,7 +1587,6 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- rb_erase(&wpa->writepages_entry, &fi->writepages);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1637,6 +1637,7 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
fuse_writepage_finish(fc, wpa);
spin_unlock(&fi->lock);
@@ -1674,7 +1675,8 @@ __acquires(fi->lock)
}
}
-static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
+ struct fuse_writepage_args *wpa)
{
pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
@@ -1697,11 +1699,17 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
else if (idx_to < curr_index)
p = &(*p)->rb_left;
else
- return (void) WARN_ON(true);
+ return curr;
}
rb_link_node(&wpa->writepages_entry, parent, p);
rb_insert_color(&wpa->writepages_entry, root);
+ return NULL;
+}
+
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+ WARN_ON(fuse_insert_writeback(root, wpa));
}
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
@@ -1714,6 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
mapping_set_error(inode->i_mapping, error);
spin_lock(&fi->lock);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
@@ -1952,14 +1961,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
}
/*
- * First recheck under fi->lock if the offending offset is still under
- * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * Check under fi->lock if the page is under writeback, and insert it onto the
+ * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
* one already added for a page at this offset. If there's none, then insert
* this new request onto the auxiliary list, otherwise reuse the existing one by
- * copying the new page contents over to the old temporary page.
+ * swapping the new temp page with the old one.
*/
-static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
- struct page *page)
+static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
+ struct page *page)
{
struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
struct fuse_writepage_args *tmp;
@@ -1967,17 +1976,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
WARN_ON(new_ap->num_pages != 0);
+ new_ap->num_pages = 1;
spin_lock(&fi->lock);
- rb_erase(&new_wpa->writepages_entry, &fi->writepages);
- old_wpa = fuse_find_writeback(fi, page->index, page->index);
+ old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
if (!old_wpa) {
- tree_insert(&fi->writepages, new_wpa);
spin_unlock(&fi->lock);
- return false;
+ return true;
}
- new_ap->num_pages = 1;
for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
pgoff_t curr_index;
@@ -2006,7 +2013,41 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
fuse_writepage_free(new_wpa);
}
- return true;
+ return false;
+}
+
+static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
+ struct fuse_args_pages *ap,
+ struct fuse_fill_wb_data *data)
+{
+ WARN_ON(!ap->num_pages);
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ if (fuse_page_is_writeback(data->inode, page->index))
+ return true;
+
+ /* Reached max pages */
+ if (ap->num_pages == fc->max_pages)
+ return true;
+
+ /* Reached max write bytes */
+ if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
+ return true;
+
+ /* Discontinuity */
+ if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
+ return true;
+
+ /* Need to grow the pages array? If so, did the expansion fail? */
+ if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
+ return true;
+
+ return false;
}
static int fuse_writepages_fill(struct page *page,
@@ -2019,7 +2060,6 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
- bool is_writeback;
int err;
if (!data->ff) {
@@ -2029,25 +2069,9 @@ static int fuse_writepages_fill(struct page *page,
goto out_unlock;
}
- /*
- * Being under writeback is unlikely but possible. For example direct
- * read to an mmaped fuse file will set the page dirty twice; once when
- * the pages are faulted with get_user_pages(), and then after the read
- * completed.
- */
- is_writeback = fuse_page_is_writeback(inode, page->index);
-
- if (wpa && ap->num_pages &&
- (is_writeback || ap->num_pages == fc->max_pages ||
- (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
- data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
+ if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
- } else if (wpa && ap->num_pages == data->max_pages) {
- if (!fuse_pages_realloc(data)) {
- fuse_writepages_send(data);
- data->wpa = NULL;
- }
}
err = -ENOMEM;
@@ -2085,12 +2109,6 @@ static int fuse_writepages_fill(struct page *page,
ap->args.end = fuse_writepage_end;
ap->num_pages = 0;
wpa->inode = inode;
-
- spin_lock(&fi->lock);
- tree_insert(&fi->writepages, wpa);
- spin_unlock(&fi->lock);
-
- data->wpa = wpa;
}
set_page_writeback(page);
@@ -2098,26 +2116,25 @@ static int fuse_writepages_fill(struct page *page,
ap->pages[ap->num_pages] = tmp_page;
ap->descs[ap->num_pages].offset = 0;
ap->descs[ap->num_pages].length = PAGE_SIZE;
+ data->orig_pages[ap->num_pages] = page;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
- if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
+ if (data->wpa) {
+ /*
+ * Protected by fi->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fi->lock);
+ ap->num_pages++;
+ spin_unlock(&fi->lock);
+ } else if (fuse_writepage_add(wpa, page)) {
+ data->wpa = wpa;
+ } else {
end_page_writeback(page);
- data->wpa = NULL;
- goto out_unlock;
}
- data->orig_pages[ap->num_pages] = page;
-
- /*
- * Protected by fi->lock against concurrent access by
- * fuse_page_is_writeback().
- */
- spin_lock(&fi->lock);
- ap->num_pages++;
- spin_unlock(&fi->lock);
-
out_unlock:
unlock_page(page);
@@ -2149,10 +2166,8 @@ static int fuse_writepages(struct address_space *mapping,
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.wpa) {
- /* Ignore errors if we can write at least one page */
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
- err = 0;
}
if (data.ff)
fuse_file_put(data.ff, false, false);
@@ -2761,7 +2776,16 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
struct iovec *iov = iov_page;
iov->iov_base = (void __user *)arg;
- iov->iov_len = _IOC_SIZE(cmd);
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ case FS_IOC_SETFLAGS:
+ iov->iov_len = sizeof(int);
+ break;
+ default:
+ iov->iov_len = _IOC_SIZE(cmd);
+ break;
+ }
if (_IOC_DIR(cmd) & _IOC_WRITE) {
in_iov = iov;
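
For illustration: the fuse writeback rework above folds the old find-then-insert pair into a single tree walk, with fuse_insert_writeback() returning a conflicting in-flight request if one overlaps and otherwise linking the new request into fi->writepages. A minimal user-space sketch of that find-or-insert pattern over page-index ranges, using a plain unbalanced binary search tree (names invented, not from this commit):

#include <stdio.h>

/* An in-flight write covering page indexes [first, last]. */
struct wb_req {
    unsigned long first, last;
    struct wb_req *left, *right;
};

/*
 * One walk does both jobs: return an overlapping entry if the range is
 * already covered, otherwise link the new node in and return NULL.
 * (The kernel version uses an rbtree and rebalances after linking.)
 */
static struct wb_req *insert_or_find(struct wb_req **root, struct wb_req *new)
{
    struct wb_req **p = root;

    while (*p) {
        struct wb_req *cur = *p;

        if (new->last < cur->first)
            p = &cur->left;
        else if (new->first > cur->last)
            p = &cur->right;
        else
            return cur;    /* ranges overlap: already in flight */
    }
    new->left = new->right = NULL;
    *p = new;
    return NULL;
}

int main(void)
{
    struct wb_req *root = NULL;
    struct wb_req a = { .first = 0, .last = 3 };
    struct wb_req b = { .first = 2, .last = 5 };    /* overlaps a */

    printf("a: %s\n", insert_or_find(&root, &a) ? "conflict" : "inserted");
    printf("b: %s\n", insert_or_find(&root, &b) ? "conflict" : "inserted");
    return 0;
}
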
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5b4aebf5821f..bba747520e9b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -121,10 +121,12 @@ static void fuse_evict_inode(struct inode *inode)
}
}
-static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+static int fuse_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- if (*flags & SB_MANDLOCK)
+ if (fc->sb_flags & SB_MANDLOCK)
return -EINVAL;
return 0;
@@ -475,6 +477,17 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fuse_fs_context *ctx = fc->fs_private;
int opt;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ /*
+ * Ignore options coming from mount(MS_REMOUNT) for backward
+ * compatibility.
+ */
+ if (fc->oldapi)
+ return 0;
+
+ return invalfc(fc, "No changes allowed in reconfigure");
+ }
+
opt = fs_parse(fc, fuse_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -817,7 +830,6 @@ static const struct super_operations fuse_super_operations = {
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
- .remount_fs = fuse_remount_fs,
.put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
@@ -1296,6 +1308,7 @@ static int fuse_get_tree(struct fs_context *fc)
static const struct fs_context_operations fuse_context_ops = {
.free = fuse_free_fc,
.parse_param = fuse_parse_param,
+ .reconfigure = fuse_reconfigure,
.get_tree = fuse_get_tree,
};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..68cd700a2719 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -468,21 +468,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
}
-/**
- * __gfs2_readpage - readpage
- * @file: The file to read a page for
- * @page: The page to read
- *
- * This is the core of gfs2's readpage. It's used by the internal file
- * reading code as in that case we already hold the glock. Also it's
- * called by gfs2_readpage() once the required lock has been granted.
- */
-
static int __gfs2_readpage(void *file, struct page *page)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-
int error;
if (i_blocksize(page->mapping->host) == PAGE_SIZE &&
@@ -505,36 +494,11 @@ static int __gfs2_readpage(void *file, struct page *page)
* gfs2_readpage - read a page of a file
* @file: The file to read
* @page: The page of the file
- *
- * This deals with the locking required. We have to unlock and
- * relock the page in order to get the locking in the right
- * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
- struct gfs2_holder gh;
- int error;
-
- unlock_page(page);
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- error = gfs2_glock_nq(&gh);
- if (unlikely(error))
- goto out;
- error = AOP_TRUNCATED_PAGE;
- lock_page(page);
- if (page->mapping == mapping && !PageUptodate(page))
- error = __gfs2_readpage(file, page);
- else
- unlock_page(page);
- gfs2_glock_dq(&gh);
-out:
- gfs2_holder_uninit(&gh);
- if (error && error != AOP_TRUNCATED_PAGE)
- lock_page(page);
- return error;
+ return __gfs2_readpage(file, page);
}
/**
@@ -598,16 +562,9 @@ static void gfs2_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- if (gfs2_glock_nq(&gh))
- goto out_uninit;
if (!gfs2_is_stuffed(ip))
mpage_readahead(rac, gfs2_block_map);
- gfs2_glock_dq(&gh);
-out_uninit:
- gfs2_holder_uninit(&gh);
}
/**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fe305e4bfd37..bebde537ac8c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -558,8 +558,29 @@ out_uninit:
return block_page_mkwrite_return(ret);
}
+static vm_fault_t gfs2_fault(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ vm_fault_t ret;
+ int err;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ err = gfs2_glock_nq(&gh);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
+ goto out_uninit;
+ }
+ ret = filemap_fault(vmf);
+ gfs2_glock_dq(&gh);
+out_uninit:
+ gfs2_holder_uninit(&gh);
+ return ret;
+}
+
static const struct vm_operations_struct gfs2_vm_ops = {
- .fault = filemap_fault,
+ .fault = gfs2_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
};
@@ -824,6 +845,9 @@ out_uninit:
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct gfs2_inode *ip;
+ struct gfs2_holder gh;
+ size_t written = 0;
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
@@ -832,7 +856,31 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
iocb->ki_flags &= ~IOCB_DIRECT;
}
- return generic_file_read_iter(iocb, to);
+ iocb->ki_flags |= IOCB_NOIO;
+ ret = generic_file_read_iter(iocb, to);
+ iocb->ki_flags &= ~IOCB_NOIO;
+ if (ret >= 0) {
+ if (!iov_iter_count(to))
+ return ret;
+ written = ret;
+ } else {
+ if (ret != -EAGAIN)
+ return ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return ret;
+ }
+ ip = GFS2_I(iocb->ki_filp->f_mapping->host);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret)
+ goto out_uninit;
+ ret = generic_file_read_iter(iocb, to);
+ if (ret > 0)
+ written += ret;
+ gfs2_glock_dq(&gh);
+out_uninit:
+ gfs2_holder_uninit(&gh);
+ return written ? written : ret;
}
/**
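
As a side note, gfs2_file_read_iter() above now tries a page-cache-only pass first (IOCB_NOIO, no glock held) and only takes the glock for whatever remains, returning the bytes already copied if the locked retry fails. A rough user-space sketch of that two-phase pattern, with a mutex standing in for the glock (all names invented, not from this commit):

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static pthread_mutex_t fake_glock = PTHREAD_MUTEX_INITIALIZER;

static const char file_data[] = "hello, gfs2";
static const size_t cached_bytes = 5;    /* pretend only 5 bytes are cached */

/* Phase 1: copy whatever the cache already holds; never blocks. */
static ssize_t read_cached(char *dst, size_t len)
{
    size_t n = len < cached_bytes ? len : cached_bytes;

    memcpy(dst, file_data, n);
    return (ssize_t)n;
}

/* Phase 2: fetch the rest; assumed to require the (cluster) lock. */
static ssize_t read_slow(char *dst, size_t off, size_t len)
{
    size_t avail = sizeof(file_data) - 1 - off;
    size_t n = len < avail ? len : avail;

    memcpy(dst, file_data + off, n);
    return (ssize_t)n;
}

static ssize_t two_phase_read(char *dst, size_t len)
{
    ssize_t written, ret;

    ret = read_cached(dst, len);           /* like the IOCB_NOIO pass */
    if (ret < 0)
        return ret;
    written = ret;
    if ((size_t)written == len)
        return written;                    /* fully served from cache */

    pthread_mutex_lock(&fake_glock);       /* like gfs2_glock_nq() */
    ret = read_slow(dst + written, (size_t)written, len - written);
    pthread_mutex_unlock(&fake_glock);

    if (ret > 0)
        written += ret;
    return written ? written : ret;        /* partial data beats an error */
}

int main(void)
{
    char buf[32] = "";
    ssize_t n = two_phase_read(buf, sizeof(buf) - 1);

    printf("read %zd bytes: \"%s\"\n", n, buf);
    return 0;
}
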
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 2299dcc417ea..8545024a1401 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1899,7 +1899,10 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl)
static void flush_delete_work(struct gfs2_glock *gl)
{
- flush_delayed_work(&gl->gl_delete);
+ if (cancel_delayed_work(&gl->gl_delete)) {
+ queue_delayed_work(gfs2_delete_workqueue,
+ &gl->gl_delete, 0);
+ }
gfs2_glock_queue_work(gl, 0);
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c84887769b5a..de1d5f1d9ff8 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -531,8 +531,7 @@ static int freeze_go_sync(struct gfs2_glock *gl)
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
@@ -545,8 +544,11 @@ static int freeze_go_sync(struct gfs2_glock *gl)
gfs2_assert_withdraw(sdp, 0);
}
queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
- GFS2_LFC_FREEZE_GO_SYNC);
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+ GFS2_LFC_FREEZE_GO_SYNC);
+ else /* read-only mounts */
+ atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
return 0;
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 03ab11fab962..ca2ec02436ec 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -399,7 +399,6 @@ enum {
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
- GIF_ORDERED = 4,
GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
GIF_DEFERRED_DELETE = 7,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 370c3a4b31ac..6774865f5b5b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -207,10 +207,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (no_formal_ino && ip->i_no_formal_ino &&
no_formal_ino != ip->i_no_formal_ino) {
+ error = -ESTALE;
if (inode->i_state & I_NEW)
goto fail;
iput(inode);
- return ERR_PTR(-ESTALE);
+ return ERR_PTR(error);
}
if (inode->i_state & I_NEW)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 3e4734431783..a76e55bc28eb 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -613,6 +613,12 @@ static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
+static void __ordered_del_inode(struct gfs2_inode *ip)
+{
+ if (!list_empty(&ip->i_ordered))
+ list_del_init(&ip->i_ordered);
+}
+
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip;
@@ -623,8 +629,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
if (ip->i_inode.i_mapping->nrpages == 0) {
- test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
- list_del(&ip->i_ordered);
+ __ordered_del_inode(ip);
continue;
}
list_move(&ip->i_ordered, &written);
@@ -643,8 +648,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
- list_del(&ip->i_ordered);
- WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+ __ordered_del_inode(ip);
if (ip->i_inode.i_mapping->nrpages == 0)
continue;
spin_unlock(&sdp->sd_ordered_lock);
@@ -659,8 +663,7 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
spin_lock(&sdp->sd_ordered_lock);
- if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
- list_del(&ip->i_ordered);
+ __ordered_del_inode(ip);
spin_unlock(&sdp->sd_ordered_lock);
}
@@ -1002,6 +1005,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
out:
if (gfs2_withdrawn(sdp)) {
+ /**
+ * If the tr_list is empty, we're withdrawing during a log
+ * flush that targets a transaction, but the transaction was
+ * never queued onto any of the ail lists. Here we add it to
+ * ail1 just so that ail_drain() will find and free it.
+ */
+ spin_lock(&sdp->sd_ail_lock);
+ if (tr && list_empty(&tr->tr_list))
+ list_add(&tr->tr_list, &sdp->sd_ail1_list);
+ spin_unlock(&sdp->sd_ail_lock);
ail_drain(sdp); /* frees all transactions */
tr = NULL;
}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c1cd6ae17659..8965c751a303 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -53,9 +53,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp))
return;
- if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
+ if (list_empty(&ip->i_ordered)) {
spin_lock(&sdp->sd_ordered_lock);
- if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
+ if (list_empty(&ip->i_ordered))
list_add(&ip->i_ordered, &sdp->sd_log_ordered);
spin_unlock(&sdp->sd_ordered_lock);
}
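
To illustrate the idiom: the gfs2 ordered-write changes drop the GIF_ORDERED flag and track membership with the list node itself; i_ordered is initialized to point at itself, removal re-initializes it, and "queued" simply means !list_empty(&ip->i_ordered). The same idea with a hand-rolled circular list, as a standalone sketch (names invented, not from this commit):

#include <stdbool.h>
#include <stdio.h>

struct node {
    struct node *prev, *next;
};

/* A node points at itself when it is on no list. */
static void node_init(struct node *n)          { n->prev = n->next = n; }
static bool node_on_list(const struct node *n) { return n->next != n; }

static void list_add_front(struct node *head, struct node *n)
{
    n->next = head->next;
    n->prev = head;
    head->next->prev = n;
    head->next = n;
}

/* Unlink and re-initialize, so membership can be re-tested safely. */
static void list_del_reinit(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    node_init(n);
}

int main(void)
{
    struct node head, inode;

    node_init(&head);
    node_init(&inode);

    printf("on list? %d\n", node_on_list(&inode));   /* 0 */
    list_add_front(&head, &inode);
    printf("on list? %d\n", node_on_list(&inode));   /* 1 */
    list_del_reinit(&inode);
    printf("on list? %d\n", node_on_list(&inode));   /* 0 again */
    return 0;
}
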
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 733470ca6be9..c7393ee9cf68 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -39,6 +39,7 @@ static void gfs2_init_inode_once(void *foo)
atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
+ INIT_LIST_HEAD(&ip->i_ordered);
ip->i_qadata = NULL;
gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
memset(&ip->i_res, 0, sizeof(ip->i_res));
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 094f5fe7c009..6d18d2c91add 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1136,7 +1136,18 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
goto fail_per_node;
}
- if (!sb_rdonly(sb)) {
+ if (sb_rdonly(sb)) {
+ struct gfs2_holder freeze_gh;
+
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
+ &freeze_gh);
+ if (error) {
+ fs_err(sdp, "can't make FS RO: %d\n", error);
+ goto fail_per_node;
+ }
+ gfs2_glock_dq_uninit(&freeze_gh);
+ } else {
error = gfs2_make_fs_rw(sdp);
if (error) {
fs_err(sdp, "can't make FS RW: %d\n", error);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 96c345f49273..390ea79d682c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -364,8 +364,8 @@ void gfs2_recover_func(struct work_struct *work)
/* Acquire a shared hold on the freeze lock */
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | LM_FLAG_PRIORITY,
- &thaw_gh);
+ LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
+ GL_EXACT, &thaw_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 32d8d26126a1..47d0ae158b69 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -167,7 +167,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
if (error)
return error;
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
&freeze_gh);
if (error)
goto fail_threads;
@@ -203,7 +204,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
return 0;
fail:
- freeze_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&freeze_gh);
fail_threads:
if (sdp->sd_quotad_process)
@@ -430,7 +430,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
}
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
- GL_NOCACHE, &sdp->sd_freeze_gh);
+ LM_FLAG_NOEXP, &sdp->sd_freeze_gh);
if (error)
goto out;
@@ -613,13 +613,15 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
!gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) {
if (!log_write_allowed) {
error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
- LM_ST_SHARED, GL_NOCACHE |
- LM_FLAG_TRY, &freeze_gh);
+ LM_ST_SHARED, LM_FLAG_TRY |
+ LM_FLAG_NOEXP | GL_EXACT,
+ &freeze_gh);
if (error == GLR_TRYFAILED)
error = 0;
} else {
error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
- LM_ST_SHARED, GL_NOCACHE,
+ LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
&freeze_gh);
if (error && !gfs2_withdrawn(sdp))
return error;
@@ -761,8 +763,8 @@ void gfs2_freeze_func(struct work_struct *work)
struct super_block *sb = sdp->sd_vfs;
atomic_inc(&sb->s_active);
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
- &freeze_gh);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT, &freeze_gh);
if (error) {
fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error);
gfs2_assert_withdraw(sdp, 0);
@@ -774,8 +776,6 @@ void gfs2_freeze_func(struct work_struct *work)
error);
gfs2_assert_withdraw(sdp, 0);
}
- if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
- freeze_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&freeze_gh);
}
deactivate_super(sb);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0b65a912b036..47c5f3aeb460 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq)
struct io_cb_cancel_data {
work_cancel_fn *fn;
void *data;
+ int nr_running;
+ int nr_pending;
+ bool cancel_all;
};
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
unsigned long flags;
- bool ret = false;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
@@ -920,74 +922,90 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
- ret = true;
+ match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
- return ret;
+ return match->nr_running && !match->cancel_all;
}
-static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
- bool found = false;
- /*
- * First check pending list, if we're lucky we can just remove it
- * from there. CANCEL_OK means that the work is returned as-new,
- * no completion will be posted for it.
- */
+retry:
spin_lock_irqsave(&wqe->lock, flags);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
+ if (!match->fn(work, match->data))
+ continue;
- if (match->fn(work, match->data)) {
- wq_list_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
+ wq_list_del(&wqe->work_list, node, prev);
+ spin_unlock_irqrestore(&wqe->lock, flags);
io_run_cancel(work, wqe);
- return IO_WQ_CANCEL_OK;
+ match->nr_pending++;
+ if (!match->cancel_all)
+ return;
+
+ /* not safe to continue after unlock */
+ goto retry;
}
+ spin_unlock_irqrestore(&wqe->lock, flags);
+}
- /*
- * Now check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempt to signal cancellation. The
- * completion will run normally in this case.
- */
+static void io_wqe_cancel_running_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
+{
rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
+ io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
+ void *data, bool cancel_all)
{
struct io_cb_cancel_data match = {
- .fn = cancel,
- .data = data,
+ .fn = cancel,
+ .data = data,
+ .cancel_all = cancel_all,
};
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
+ /*
+ * First check pending list, if we're lucky we can just remove it
+ * from there. CANCEL_OK means that the work is returned as-new,
+ * no completion will be posted for it.
+ */
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- ret = io_wqe_cancel_work(wqe, &match);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
+ io_wqe_cancel_pending_work(wqe, &match);
+ if (match.nr_pending && !match.cancel_all)
+ return IO_WQ_CANCEL_OK;
}
- return ret;
+ /*
+ * Now check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempt to signal cancellation. The
+ * completion will run normally in this case.
+ */
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ io_wqe_cancel_running_work(wqe, &match);
+ if (match.nr_running && !match.cancel_all)
+ return IO_WQ_CANCEL_RUNNING;
+ }
+
+ if (match.nr_running)
+ return IO_WQ_CANCEL_RUNNING;
+ if (match.nr_pending)
+ return IO_WQ_CANCEL_OK;
+ return IO_WQ_CANCEL_NOTFOUND;
}
static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
@@ -997,21 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
-}
-
-static bool io_wq_pid_match(struct io_wq_work *work, void *data)
-{
- pid_t pid = (pid_t) (unsigned long) data;
-
- return work->task_pid == pid;
-}
-
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
-{
- void *data = (void *) (unsigned long) pid;
-
- return io_wq_cancel_cb(wq, io_wq_pid_match, data);
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
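
For illustration: the io-wq rework turns cancellation from "find one match and stop" into an optional sweep that counts how many pending and running items matched, then derives the return code from those counters. A compact user-space sketch of the sweep over a singly linked pending list (names invented, not from this commit):

#include <stdbool.h>
#include <stdio.h>

struct work {
    int id;
    struct work *next;
};

typedef bool (*match_fn)(struct work *, void *);

/*
 * Remove every matching entry (or just the first if !cancel_all) and
 * return how many were cancelled; the caller turns the count into an
 * "ok / running / not found" style result.
 */
static int cancel_pending(struct work **head, match_fn match, void *data,
                          bool cancel_all)
{
    struct work **pp = head;
    int nr = 0;

    while (*pp) {
        struct work *w = *pp;

        if (!match(w, data)) {
            pp = &w->next;
            continue;
        }
        *pp = w->next;        /* unlink; the real code then runs the cancel */
        nr++;
        if (!cancel_all)
            break;
    }
    return nr;
}

static bool match_id(struct work *w, void *data)
{
    return w->id == *(int *)data;
}

int main(void)
{
    struct work c = { 3, NULL }, b = { 3, &c }, a = { 1, &b };
    struct work *head = &a;
    int key = 3;

    printf("cancelled %d\n", cancel_pending(&head, match_id, &key, true));
    return 0;
}
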
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 8e138fa88b9f..071f1a997800 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -90,7 +90,6 @@ struct io_wq_work {
const struct cred *creds;
struct fs_struct *fs;
unsigned flags;
- pid_t task_pid;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -125,12 +124,11 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
-enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data);
+ void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d830ddb..32b0064f806e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -598,10 +599,13 @@ enum {
REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* req->task is refcounted */
+ REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
struct io_poll_iocb poll;
+ struct io_poll_iocb *double_poll;
struct io_wq_work work;
};
@@ -887,6 +891,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
+static void io_complete_rw_common(struct kiocb *kiocb, long res);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -910,6 +915,21 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static void io_get_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ return;
+ get_task_struct(req->task);
+ req->flags |= REQ_F_TASK_PINNED;
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ put_task_struct(req->task);
+}
+
static void io_file_put_work(struct work_struct *work);
/*
@@ -1045,8 +1065,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
spin_unlock(&current->fs->lock);
}
- if (!req->work.task_pid)
- req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -1079,6 +1097,8 @@ static inline void io_prep_async_work(struct io_kiocb *req,
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ io_req_init_async(req);
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
io_wq_hash_work(&req->work, file_inode(req->file));
@@ -1256,6 +1276,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
@@ -1293,6 +1314,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
@@ -1398,9 +1420,7 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->task)
- put_task_struct(req->task);
-
+ __io_put_req_task(req);
io_req_work_drop_env(req);
}
@@ -1727,6 +1747,26 @@ static int io_put_kbuf(struct io_kiocb *req)
return cflags;
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+
+ /* shouldn't happen unless io_uring is dying, cancel reqs */
+ if (unlikely(!current->mm)) {
+ io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
+ io_put_req(req);
+ continue;
+ }
+
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1735,12 +1775,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
+ LIST_HEAD(again);
+
+ /* order with ->result store in io_complete_rw_iopoll() */
+ smp_rmb();
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, list);
+ if (READ_ONCE(req->result) == -EAGAIN) {
+ req->iopoll_completed = 0;
+ list_move_tail(&req->list, &again);
+ continue;
+ }
list_del(&req->list);
if (req->flags & REQ_F_BUFFER_SELECTED)
@@ -1758,18 +1807,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
-}
-
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
- do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- } while (!list_empty(again));
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1777,7 +1817,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
- LIST_HEAD(again);
bool spin;
int ret;
@@ -1803,13 +1842,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
- if (req->result == -EAGAIN) {
- list_move_tail(&req->list, &again);
- continue;
- }
- if (!list_empty(&again))
- break;
-
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1822,9 +1854,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
- if (!list_empty(&again))
- io_iopoll_queue(&again);
-
return ret;
}
@@ -1973,11 +2002,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result)
+ if (res != -EAGAIN && res != req->result)
req_set_fail_links(req);
- req->result = res;
- if (res != -EAGAIN)
- WRITE_ONCE(req->iopoll_completed, 1);
+
+ WRITE_ONCE(req->result, res);
+ /* order with io_poll_complete() checking ->result */
+ smp_wmb();
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
@@ -2650,8 +2681,8 @@ copy_iov:
}
}
out_free:
- kfree(iovec);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -2773,8 +2804,8 @@ copy_iov:
}
}
out_free:
- req->flags &= ~REQ_F_NEED_CLEANUP;
- kfree(iovec);
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -3524,6 +3555,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ io->msg.msg.msg_name = &io->msg.addr;
io->msg.iov = io->msg.fast_iov;
ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
@@ -3705,6 +3737,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
{
+ io->msg.msg.msg_name = &io->msg.addr;
io->msg.iov = io->msg.fast_iov;
#ifdef CONFIG_COMPAT
@@ -3813,10 +3846,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
+ if (force_nonblock && ret == -EAGAIN) {
+ ret = io_setup_async_msg(req, kmsg);
+ if (ret != -EAGAIN)
+ kfree(kbuf);
+ return ret;
+ }
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (kbuf)
+ kfree(kbuf);
}
if (kmsg && kmsg->iov != kmsg->fast_iov)
@@ -4045,6 +4084,29 @@ struct io_poll_table {
int error;
};
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
@@ -4068,13 +4130,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = task_work_add(tsk, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, true);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
}
- wake_up_process(tsk);
return 1;
}
@@ -4098,9 +4160,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req)
+static void io_poll_remove_double(struct io_kiocb *req, void *data)
{
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = data;
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4120,7 +4182,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4163,21 +4225,21 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ struct io_poll_iocb *poll = req->apoll->double_poll;
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
- if (req->poll.head) {
+ if (poll && poll->head) {
bool done;
- spin_lock(&req->poll.head->lock);
- done = list_empty(&req->poll.wait.entry);
+ spin_lock(&poll->head->lock);
+ done = list_empty(&poll->wait.entry);
if (!done)
- list_del_init(&req->poll.wait.entry);
- spin_unlock(&req->poll.head->lock);
+ list_del_init(&poll->wait.entry);
+ spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
}
@@ -4197,7 +4259,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
}
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
+ struct wait_queue_head *head,
+ struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
@@ -4208,7 +4271,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (unlikely(poll->head)) {
/* already have a 2nd entry, fail a third attempt */
- if (req->io) {
+ if (*poll_ptr) {
pt->error = -EINVAL;
return;
}
@@ -4220,7 +4283,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
- req->io = (void *) poll;
+ *poll_ptr = poll;
}
pt->error = 0;
@@ -4232,8 +4295,31 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
+}
+
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
}
static void io_async_task_func(struct callback_head *cb)
@@ -4261,20 +4347,27 @@ static void io_async_task_func(struct callback_head *cb)
}
}
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
if (!canceled) {
__set_current_state(TASK_RUNNING);
+ if (io_sq_thread_acquire_mm(ctx, req)) {
+ io_cqring_add_event(req, -EFAULT);
+ goto end_req;
+ }
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
} else {
io_cqring_ev_posted(ctx);
+end_req:
req_set_fail_links(req);
io_double_put_req(req);
}
@@ -4348,7 +4441,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
- bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
@@ -4360,14 +4452,13 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return false;
+ apoll->double_poll = NULL;
req->flags |= REQ_F_POLLED;
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&apoll->work, &req->work, sizeof(req->work));
- had_io = req->io != NULL;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4383,13 +4474,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret) {
- ipt.error = 0;
- /* only remove double add if we did it here */
- if (!had_io)
- io_poll_remove_double(req);
+ io_poll_remove_double(req, apoll->double_poll);
spin_unlock_irq(&ctx->completion_lock);
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
return false;
}
@@ -4420,11 +4509,13 @@ static bool io_poll_remove_one(struct io_kiocb *req)
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req);
+ io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
+ io_poll_remove_double(req, apoll->double_poll);
+
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
@@ -4437,6 +4528,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work,
sizeof(req->work));
+ kfree(apoll->double_poll);
kfree(apoll);
}
}
@@ -4537,7 +4629,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- __io_queue_proc(&pt->req->poll, pt, head);
+ __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4555,8 +4647,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
return 0;
}
@@ -4646,7 +4737,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
req->timeout.addr = READ_ONCE(sqe->addr);
@@ -4772,7 +4865,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -4824,8 +4917,9 @@ static int io_async_cancel_prep(struct io_kiocb *req,
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
- sqe->cancel_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -4843,7 +4937,9 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+ if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
@@ -5308,9 +5404,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
- if (req->result == -EAGAIN)
- return -EAGAIN;
-
/* workqueue context doesn't hold uring_lock, grab it now */
if (in_async)
mutex_lock(&ctx->uring_lock);
@@ -5637,6 +5730,7 @@ fail_req:
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
@@ -5817,17 +5911,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
- req->task = NULL;
+ req->task = current;
req->result = 0;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_mm);
- }
+ if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
@@ -5936,16 +6027,6 @@ fail_req:
return submitted;
}
-static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- }
-}
-
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
@@ -5979,7 +6060,7 @@ static int io_sq_thread(void *data)
* If submit got -EBUSY, flag us as needing the application
* to enter the kernel to reap and flush events.
*/
- if (!to_submit || ret == -EBUSY) {
+ if (!to_submit || ret == -EBUSY || need_resched()) {
/*
* Drop cur_mm before scheduling, we can't hold it for
* long periods (or over schedule()). Do this before
@@ -5995,7 +6076,7 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (!list_empty(&ctx->poll_list) ||
+ if (!list_empty(&ctx->poll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
if (current->task_works)
@@ -6021,9 +6102,9 @@ static int io_sq_thread(void *data)
}
/* Tell userspace we may need a wakeup call */
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
- /* make sure to read SQ tail after writing flags */
- smp_mb();
+ spin_unlock_irq(&ctx->completion_lock);
to_submit = io_sqring_entries(ctx);
if (!to_submit || ret == -EBUSY) {
@@ -6041,13 +6122,17 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
}
mutex_lock(&ctx->uring_lock);
@@ -6146,15 +6231,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
+ /* make sure we run task_work before checking for signals */
if (current->task_works)
task_work_run();
- if (io_should_wake(&iowq, false))
- break;
- schedule();
if (signal_pending(current)) {
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ continue;
+ }
ret = -EINTR;
break;
}
+ if (io_should_wake(&iowq, false))
+ break;
+ schedule();
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6626,6 +6719,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_data->table[i].files);
+ percpu_ref_exit(&ctx->file_data->refs);
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
@@ -6778,8 +6872,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
- if (err)
+ if (err) {
+ fput(file);
break;
+ }
}
nr_args--;
done++;
@@ -7275,9 +7371,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_mem_free(ctx->sq_sqes);
percpu_ref_exit(&ctx->refs);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user,
- ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
kfree(ctx->cancel_hash);
@@ -7331,7 +7424,17 @@ static void io_ring_exit_work(struct work_struct *work)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- wait_for_completion(&ctx->ref_comp);
+ /*
+ * If we're doing polled IO and end up having requests being
+ * submitted async (out-of-line), then completions can come in while
+ * we're waiting for refs to drop. We need to reap these manually,
+ * as nobody else will be looking for them.
+ */
+ while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+ io_iopoll_reap_events(ctx);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ }
io_ring_ctx_free(ctx);
}
@@ -7352,6 +7455,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+
+ /*
+ * Do this upfront, so we won't have a grace period where the ring
+ * is closed but resources aren't reaped yet. This can cause
+ * spurious failure in setting up a new ring.
+ */
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
queue_work(system_wq, &ctx->exit_work);
}
@@ -7365,9 +7478,22 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+ struct files_struct *files = data;
+
+ return work->files == files;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
+ if (list_empty_careful(&ctx->inflight_list))
+ return;
+
+	/* cancel all at once, should be faster than doing it one by one */
+ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
+
while (!list_empty_careful(&ctx->inflight_list)) {
struct io_kiocb *cancel_req = NULL, *req;
DEFINE_WAIT(wait);
@@ -7398,6 +7524,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (list_empty(&ctx->cq_overflow_list)) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irq(&ctx->completion_lock);
@@ -7423,6 +7550,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
}
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+ return req->task == task;
+}
+
static int io_uring_flush(struct file *file, void *data)
{
struct io_ring_ctx *ctx = file->private_data;
@@ -7433,7 +7568,7 @@ static int io_uring_flush(struct file *file, void *data)
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
return 0;
}
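
As a side note, io_complete_rw_iopoll() above now always stores ->result and then sets ->iopoll_completed, with an smp_wmb()/smp_rmb() pair so the iopoll reaper never observes the flag without the result. The same publish/consume ordering expressed with C11 release/acquire atomics, as a standalone sketch (not from this commit):

#include <stdatomic.h>
#include <stdio.h>

struct req {
    long result;            /* plain payload */
    atomic_int completed;   /* publication flag */
};

/* Writer side: store the payload, then release-store the flag. */
static void complete(struct req *r, long res)
{
    r->result = res;
    atomic_store_explicit(&r->completed, 1, memory_order_release);
}

/* Reader side: acquire-load the flag; if set, ->result is visible. */
static int reap(struct req *r, long *res)
{
    if (!atomic_load_explicit(&r->completed, memory_order_acquire))
        return 0;
    *res = r->result;
    return 1;
}

int main(void)
{
    struct req r = { .result = 0, .completed = 0 };
    long res;

    complete(&r, 42);
    if (reap(&r, &res))
        printf("result %ld\n", res);
    return 0;
}
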
diff --git a/fs/namespace.c b/fs/namespace.c
index f30ed401cc6d..4a0f600a3328 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2603,6 +2603,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ fc->oldapi = true;
err = parse_monolithic_mount_data(fc, data);
if (!err) {
down_write(&sb->s_umount);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d399f72ebbb..de03e440b7ee 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -907,9 +907,8 @@ retry:
goto out_mds;
/* Use a direct mapping of ds_idx to pgio mirror_idx */
- if (WARN_ON_ONCE(pgio->pg_mirror_count !=
- FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
- goto out_mds;
+ if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+ goto out_eagain;
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -931,7 +930,10 @@ retry:
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
return;
-
+out_eagain:
+ pnfs_generic_pg_cleanup(pgio);
+ pgio->pg_error = -EAGAIN;
+ return;
out_mds:
trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
0, NFS4_MAX_UINT64, IOMODE_RW,
@@ -941,6 +943,7 @@ out_mds:
pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_write_mds(pgio);
+ pgio->pg_error = -EAGAIN;
}
static unsigned int
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a3ab6e219061..873342308dc0 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -308,6 +308,7 @@ static int try_location(struct fs_context *fc,
if (IS_ERR(export_path))
return PTR_ERR(export_path);
+ kfree(ctx->nfs_server.export_path);
ctx->nfs_server.export_path = export_path;
source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e32717fd1169..2e2dac29a9e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -774,6 +774,14 @@ static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
slot->seq_nr_last_acked = seqnr;
}
+static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred,
+ struct nfs4_slot *slot)
+{
+ struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
@@ -790,6 +798,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto out;
session = slot->table->session;
+ clp = session->clp;
trace_nfs4_sequence_done(session, res);
@@ -804,7 +813,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
- clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
@@ -852,10 +860,18 @@ static int nfs41_sequence_process(struct rpc_task *task,
/*
* Were one or more calls using this slot interrupted?
* If the server never received the request, then our
- * transmitted slot sequence number may be too high.
+ * transmitted slot sequence number may be too high. However,
+ * if the server did receive the request then it might
+ * accidentally give us a reply with a mismatched operation.
+ * We can sort this out by sending a lone sequence operation
+ * to the server on the same slot.
*/
if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
slot->seq_nr--;
+ if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) {
+ nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot);
+ res->sr_slot = NULL;
+ }
goto retry_nowait;
}
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bb3d2c32664a..c9056316a0b3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -507,6 +507,17 @@ find_any_file(struct nfs4_file *f)
return ret;
}
+static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+{
+ struct nfsd_file *ret = NULL;
+
+ spin_lock(&f->fi_lock);
+ if (f->fi_deleg_file)
+ ret = nfsd_file_get(f->fi_deleg_file);
+ spin_unlock(&f->fi_lock);
+ return ret;
+}
+
static atomic_long_t num_delegations;
unsigned long max_delegations;
@@ -2444,6 +2455,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2481,6 +2494,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
oo = ols->st_stateowner;
nf = st->sc_file;
file = find_any_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2513,7 +2528,9 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = nf->fi_deleg_file;
+ file = find_deleg_file(nf);
+ if (!file)
+ return 0;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2529,6 +2546,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_fname(s, file);
seq_printf(s, " }\n");
+ nfsd_file_put(file);
return 0;
}
@@ -7912,9 +7930,14 @@ nfs4_state_start_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- ret = nfs4_state_create_net(net);
+ ret = get_nfsdfs(net);
if (ret)
return ret;
+ ret = nfs4_state_create_net(net);
+ if (ret) {
+ mntput(nn->nfsd_mnt);
+ return ret;
+ }
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
@@ -7984,6 +8007,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ mntput(nn->nfsd_mnt);
}
void
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b68e96681522..cd05732f8eaa 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1335,6 +1335,7 @@ void nfsd_client_rmdir(struct dentry *dentry)
WARN_ON_ONCE(ret);
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
+ dput(dentry);
inode_unlock(dir);
}
@@ -1424,6 +1425,18 @@ static struct file_system_type nfsd_fs_type = {
};
MODULE_ALIAS_FS("nfsd");
+int get_nfsdfs(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct vfsmount *mnt;
+
+ mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ nn->nfsd_mnt = mnt;
+ return 0;
+}
+
#ifdef CONFIG_PROC_FS
static int create_proc_exports_entry(void)
{
@@ -1451,7 +1464,6 @@ unsigned int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
int retval;
- struct vfsmount *mnt;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
retval = nfsd_export_init(net);
@@ -1478,16 +1490,8 @@ static __net_init int nfsd_init_net(struct net *net)
init_waitqueue_head(&nn->ntf_wq);
seqlock_init(&nn->boot_lock);
- mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
- if (IS_ERR(mnt)) {
- retval = PTR_ERR(mnt);
- goto out_mount_err;
- }
- nn->nfsd_mnt = mnt;
return 0;
-out_mount_err:
- nfsd_reply_cache_shutdown(nn);
out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
@@ -1500,7 +1504,6 @@ static __net_exit void nfsd_exit_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- mntput(nn->nfsd_mnt);
nfsd_reply_cache_shutdown(nn);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 36cdd81b6688..57c832d1b30f 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -90,6 +90,8 @@ void nfsd_destroy(struct net *net);
bool i_am_nfsd(void);
+int get_nfsdfs(struct net *);
+
struct nfsdfs_client {
struct kref cl_ref;
void (*cl_release)(struct kref *kref);
@@ -100,6 +102,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
void nfsd_client_rmdir(struct dentry *dentry);
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
#ifdef CONFIG_NFSD_V2_ACL
extern const struct svc_version nfsd_acl_version2;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c3fbab1753ec..d22a056da477 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1226,6 +1226,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
err = 0;
host_err = 0;
switch (type) {
@@ -1458,6 +1461,9 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
if (host_err < 0) {
fh_drop_write(fhp);
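
For illustration: the nfsd change applies the server task's umask to the client-requested mode when the directory carries no POSIX ACLs, i.e. a plain AND-NOT of the umask bits. A trivial standalone illustration of that masking (values invented, not from this commit):

#include <stdio.h>

int main(void)
{
    unsigned int requested = 0666;   /* mode asked for by the client */
    unsigned int umask_bits = 0022;  /* typical process umask */

    /* same shape as: iap->ia_mode &= ~current_umask(); */
    unsigned int effective = requested & ~umask_bits;

    printf("%04o & ~%04o = %04o\n", requested, umask_bits, effective);
    return 0;
}
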
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 152a0fc4e905..751bc4dc7466 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
@@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
@@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
}
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
@@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
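
The new nfs_sync_rwlock serializes exclusive ("ex") takers of the NFS sync lock against shared takers on the local node. A rough userspace analogue using a pthread rwlock, shown only to illustrate the shared/exclusive pattern, not the ocfs2 implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t nfs_sync_rwlock = PTHREAD_RWLOCK_INITIALIZER;

static void nfs_sync_lock(int ex)
{
	if (ex)
		pthread_rwlock_wrlock(&nfs_sync_rwlock);	/* exclusive */
	else
		pthread_rwlock_rdlock(&nfs_sync_rwlock);	/* shared */
}

static void nfs_sync_unlock(void)
{
	pthread_rwlock_unlock(&nfs_sync_rwlock);
}

int main(void)
{
	nfs_sync_lock(0);
	printf("shared section\n");
	nfs_sync_unlock();

	nfs_sync_lock(1);
	printf("exclusive section\n");
	nfs_sync_unlock();
	return 0;
}
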
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee5d98516212..2dd71d626196 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -395,6 +395,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct rw_semaphore nfs_sync_rwlock;
struct ocfs2_lock_res osb_trim_fs_lockres;
struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0dd8c41bafd4..19137c6d087b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -290,7 +290,7 @@
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT -1
+#define OCFS2_INVALID_SLOT ((u16)-1)
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
USER_QUOTA_SYSTEM_INODE,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4836becb7578..45745cc3408a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- inode_alloc_inode =
- ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
- suballoc_slot);
+ if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+ else
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
if (!inode_alloc_inode) {
/* the error code could be inaccurate, but we are not able to
* get the correct one. */
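
The (u16) cast on OCFS2_INVALID_SLOT matters because the on-disk suballoc slot is a 16-bit field: compared against a plain int -1 it promotes to 65535 and never matches. A small standalone C illustration of that promotion (uint16_t stands in for the kernel's u16):

#include <stdio.h>
#include <stdint.h>

#define INVALID_SLOT_OLD  -1			/* plain int -1 */
#define INVALID_SLOT_NEW  ((uint16_t)-1)	/* 0xffff */

int main(void)
{
	uint16_t suballoc_slot = 0xffff;	/* "invalid" slot read from disk */

	/* suballoc_slot promotes to int 65535, never equal to -1 */
	printf("old compare: %d\n", suballoc_slot == INVALID_SLOT_OLD);
	/* both sides promote to 65535, so this matches */
	printf("new compare: %d\n", suballoc_slot == INVALID_SLOT_NEW);
	return 0;
}
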
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 79dd052c7dbf..5e0cde85bd6b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -895,7 +895,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
return err;
}
-int ovl_copy_up_flags(struct dentry *dentry, int flags)
+static int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 8f4286450f92..0e696f72cf65 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -476,7 +476,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
if (IS_ERR_OR_NULL(this))
return this;
- if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
+ if (ovl_dentry_real_at(this, layer->idx) != real) {
dput(this);
this = ERR_PTR(-EIO);
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 01820e654a21..0d940e29d62b 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -33,13 +33,16 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
+/* No atime modification nor notify on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+
static struct file *ovl_open_realfile(const struct file *file,
struct inode *realinode)
{
struct inode *inode = file_inode(file);
struct file *realfile;
const struct cred *old_cred;
- int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
int acc_mode = ACC_MODE(flags);
int err;
@@ -72,8 +75,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
struct inode *inode = file_inode(file);
int err;
- /* No atime modificaton on underlying */
- flags |= O_NOATIME | FMODE_NONOTIFY;
+ flags |= OVL_OPEN_FLAGS;
/* If some flag changed that cannot be changed then something's amiss */
if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
@@ -126,7 +128,7 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
}
/* Did the flags change since open? */
- if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME))
+ if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
return ovl_change_flags(real->file, file->f_flags);
return 0;
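
Factoring the forced bits into OVL_OPEN_FLAGS keeps the open path and the "did the flags change" check working from the same mask. An illustrative userspace sketch of that XOR-and-mask comparison (the flag values and FORCED_FLAGS are stand-ins, not the real overlayfs mask):

#include <stdio.h>

#define FL_RDWR		0x1
#define FL_NOATIME	0x2
#define FORCED_FLAGS	FL_NOATIME	/* plays the role of OVL_OPEN_FLAGS */

int main(void)
{
	int user_flags = FL_RDWR;			/* flags on the overlay file */
	int real_flags = FL_RDWR | FL_NOATIME;		/* real file opened with forced bits */

	/* bits differ only where we force them ourselves -> nothing to fix up */
	if ((user_flags ^ real_flags) & ~FORCED_FLAGS)
		printf("flags drifted, adjust the real file\n");
	else
		printf("only forced flags differ, nothing to do\n");
	return 0;
}
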
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 3566282a9199..f7d4358db637 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -389,7 +389,7 @@ invalid:
}
static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
- struct ovl_path **stackp, unsigned int *ctrp)
+ struct ovl_path **stackp)
{
struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN);
int err;
@@ -406,10 +406,6 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
return err;
}
- if (WARN_ON(*ctrp))
- return -EIO;
-
- *ctrp = 1;
return 0;
}
@@ -861,8 +857,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
if (upperdentry && !d.is_dir) {
- unsigned int origin_ctr = 0;
-
/*
* Lookup copy up origin by decoding origin file handle.
* We may get a disconnected dentry, which is fine,
@@ -873,8 +867,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* number - it's the same as if we held a reference
* to a dentry in lower layer that was moved under us.
*/
- err = ovl_check_origin(ofs, upperdentry, &origin_path,
- &origin_ctr);
+ err = ovl_check_origin(ofs, upperdentry, &origin_path);
if (err)
goto out_put_upper;
@@ -1073,6 +1066,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperredirect = NULL;
goto out_free_oe;
}
+ err = ovl_check_metacopy_xattr(upperdentry);
+ if (err < 0)
+ goto out_free_oe;
+ uppermetacopy = err;
}
if (upperdentry || ctr) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b725c7f15ff4..29bc1ec699e7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -483,7 +483,6 @@ void ovl_aio_request_cache_destroy(void);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
-int ovl_copy_up_flags(struct dentry *dentry, int flags);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 91476bc422f9..4b38141c2985 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -580,12 +580,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
}
}
- /* Workdir is useless in non-upper mount */
- if (!config->upperdir && config->workdir) {
- pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
- config->workdir);
- kfree(config->workdir);
- config->workdir = NULL;
+ /* Workdir/index are useless in non-upper mount */
+ if (!config->upperdir) {
+ if (config->workdir) {
+ pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+ if (config->index && index_opt) {
+ pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n");
+ index_opt = false;
+ }
+ config->index = false;
}
err = ovl_parse_redirect_mode(config, config->redirect_mode);
@@ -622,11 +629,13 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
/* Resolve nfs_export -> index dependency */
if (config->nfs_export && !config->index) {
- if (nfs_export_opt && index_opt) {
+ if (!config->upperdir && config->redirect_follow) {
+ pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
+ config->nfs_export = false;
+ } else if (nfs_export_opt && index_opt) {
pr_err("conflicting options: nfs_export=on,index=off\n");
return -EINVAL;
- }
- if (index_opt) {
+ } else if (index_opt) {
/*
* There was an explicit index=off that resulted
* in this conflict.
@@ -1352,8 +1361,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
goto out;
}
+ /* index dir will act also as workdir */
+ iput(ofs->workdir_trap);
+ ofs->workdir_trap = NULL;
+ dput(ofs->workdir);
+ ofs->workdir = NULL;
ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
if (ofs->indexdir) {
+ ofs->workdir = dget(ofs->indexdir);
+
err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
"indexdir");
if (err)
@@ -1396,6 +1412,18 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
return true;
+ /*
+ * We allow using single lower with null uuid for index and nfs_export
+ * for example to support those features with single lower squashfs.
+ * To avoid regressions in setups of overlay with re-formatted lower
+ * squashfs, do not allow decoding origin with lower null uuid unless
+ * user opted-in to one of the new features that require following the
+ * lower inode of non-dir upper.
+ */
+ if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino &&
+ uuid_is_null(uuid))
+ return false;
+
for (i = 0; i < ofs->numfs; i++) {
/*
* We use uuid to associate an overlay lower file handle with a
@@ -1493,14 +1521,23 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
+ /*
+	 * Check if lower root conflicts with this overlay's layers before
+ * checking if it is in-use as upperdir/workdir of "another"
+ * mount, because we do not bother to check in ovl_is_inuse() if
+ * the upperdir/workdir is in fact in-use by our
+ * upperdir/workdir.
+ */
err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
if (err)
goto out;
if (ovl_is_inuse(stack[i].dentry)) {
err = ovl_report_in_use(ofs, "lowerdir");
- if (err)
+ if (err) {
+ iput(trap);
goto out;
+ }
}
mnt = clone_private_mount(&stack[i]);
@@ -1575,10 +1612,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
if (!ofs->config.upperdir && numlower == 1) {
pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
return ERR_PTR(-EINVAL);
- } else if (!ofs->config.upperdir && ofs->config.nfs_export &&
- ofs->config.redirect_follow) {
- pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n");
- ofs->config.nfs_export = false;
}
stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL);
@@ -1842,21 +1875,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
- if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
- /* index dir will act also as workdir */
- dput(ofs->workdir);
- ofs->workdir = NULL;
- iput(ofs->workdir_trap);
- ofs->workdir_trap = NULL;
-
+ if (!ovl_force_readonly(ofs) && ofs->config.index) {
err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
if (err)
goto out_free_oe;
/* Force r/o mount with no index dir */
- if (ofs->indexdir)
- ofs->workdir = dget(ofs->indexdir);
- else
+ if (!ofs->indexdir)
sb->s_flags |= SB_RDONLY;
}
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 9955d75c0585..ad31ec4ad627 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v)
static int __init copy_xbc_key_value_list(char *dst, size_t size)
{
struct xbc_node *leaf, *vnode;
- const char *val;
char *key, *end = dst + size;
+ const char *val;
+ char q;
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
@@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
vnode = xbc_node_get_child(leaf);
- if (vnode && xbc_node_is_array(vnode)) {
+ if (vnode) {
xbc_array_for_each_value(vnode, val) {
- ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
- val, vnode->next ? ", " : "\n");
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, vnode->next ? ", " : "\n");
if (ret < 0)
goto out;
dst += ret;
}
} else {
- ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
if (ret < 0)
break;
dst += ret;
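
The quote-selection logic above switches to single quotes whenever the value itself contains a double quote, so the emitted list stays parseable. A standalone sketch of the same choice (emit() is an illustrative helper, not a kernel function):

#include <stdio.h>
#include <string.h>

static void emit(const char *val)
{
	/* fall back to single quotes when the value holds a double quote */
	char q = strchr(val, '"') ? '\'' : '"';

	printf("%c%s%c\n", q, val, q);
}

int main(void)
{
	emit("plain value");
	emit("value with \"quotes\" inside");
	return 0;
}
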
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8ba492d44e68..e502414b3556 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* Using bounce buffer to bypass the
* hardened user copy kernel text checks.
*/
- if (probe_kernel_read(buf, (void *) start, tsz)) {
+ if (copy_from_kernel_nofault(buf, (void *)start,
+ tsz)) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c..6c1166ccdaea 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -566,8 +566,9 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
goto out;
/* don't even try if the size is too large */
- if (count > KMALLOC_MAX_SIZE)
- return -ENOMEM;
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
if (write) {
kbuf = memdup_user_nul(ubuf, count);
@@ -576,7 +577,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
goto out;
}
} else {
- error = -ENOMEM;
kbuf = kzalloc(count, GFP_KERNEL);
if (!kbuf)
goto out;
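
Tightening the check to >= presumably leaves room for the extra NUL byte that memdup_user_nul() appends on the write path, so the allocation never exceeds the cap. A userspace sketch of the same sizing concern (MAX_ALLOC and dup_nul() are illustrative stand-ins, not kernel interfaces):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ALLOC 32	/* illustrative stand-in for KMALLOC_MAX_SIZE */

/* Duplicate "count" bytes and append a NUL: the buffer is count + 1 bytes,
 * so count itself must stay strictly below the cap. */
static char *dup_nul(const char *src, size_t count)
{
	char *buf;

	if (count >= MAX_ALLOC)		/* ">=" leaves room for the NUL */
		return NULL;
	buf = malloc(count + 1);
	if (!buf)
		return NULL;
	memcpy(buf, src, count);
	buf[count] = '\0';
	return buf;
}

int main(void)
{
	char *s = dup_nul("hello", 5);

	printf("%s\n", s ? s : "(rejected)");
	free(s);
	return 0;
}
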
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..4fb797822567 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,28 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
return ret;
}
-ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
- loff_t *pos)
+ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
+ mm_segment_t old_fs = get_fs();
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ set_fs(KERNEL_DS);
if (file->f_op->read)
- return file->f_op->read(file, buf, count, pos);
+ ret = file->f_op->read(file, (void __user *)buf, count, pos);
else if (file->f_op->read_iter)
- return new_sync_read(file, buf, count, pos);
+ ret = new_sync_read(file, (void __user *)buf, count, pos);
else
- return -EINVAL;
+ ret = -EINVAL;
+ set_fs(old_fs);
+ if (ret > 0) {
+ fsnotify_access(file);
+ add_rchar(current, ret);
+ }
+ inc_syscr(current);
+ return ret;
}
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs;
- ssize_t result;
+ ssize_t ret;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The cast to a user pointer is valid due to the set_fs() */
- result = vfs_read(file, (void __user *)buf, count, pos);
- set_fs(old_fs);
- return result;
+ ret = rw_verify_area(READ, file, pos, count);
+ if (ret)
+ return ret;
+ return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);
@@ -456,17 +470,22 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
- if (!ret) {
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- ret = __vfs_read(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_access(file);
- add_rchar(current, ret);
- }
- inc_syscr(current);
- }
+ if (ret)
+ return ret;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ if (file->f_op->read)
+ ret = file->f_op->read(file, buf, count, pos);
+ else if (file->f_op->read_iter)
+ ret = new_sync_read(file, buf, count, pos);
+ else
+ ret = -EINVAL;
+ if (ret > 0) {
+ fsnotify_access(file);
+ add_rchar(current, ret);
+ }
+ inc_syscr(current);
return ret;
}
@@ -488,23 +507,15 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
return ret;
}
-static ssize_t __vfs_write(struct file *file, const char __user *p,
- size_t count, loff_t *pos)
-{
- if (file->f_op->write)
- return file->f_op->write(file, p, count, pos);
- else if (file->f_op->write_iter)
- return new_sync_write(file, p, count, pos);
- else
- return -EINVAL;
-}
-
+/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
mm_segment_t old_fs;
const char __user *p;
ssize_t ret;
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
+ return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
@@ -513,7 +524,12 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
- ret = __vfs_write(file, p, count, pos);
+ if (file->f_op->write)
+ ret = file->f_op->write(file, p, count, pos);
+ else if (file->f_op->write_iter)
+ ret = new_sync_write(file, p, count, pos);
+ else
+ ret = -EINVAL;
set_fs(old_fs);
if (ret > 0) {
fsnotify_modify(file);
@@ -522,21 +538,20 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
-EXPORT_SYMBOL(__kernel_write);
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
loff_t *pos)
{
- mm_segment_t old_fs;
- ssize_t res;
+ ssize_t ret;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_write(file, (__force const char __user *)buf, count, pos);
- set_fs(old_fs);
+ ret = rw_verify_area(WRITE, file, pos, count);
+ if (ret)
+ return ret;
- return res;
+ file_start_write(file);
+ ret = __kernel_write(file, buf, count, pos);
+ file_end_write(file);
+ return ret;
}
EXPORT_SYMBOL(kernel_write);
@@ -552,19 +567,23 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
- if (!ret) {
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- file_start_write(file);
- ret = __vfs_write(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_modify(file);
- add_wchar(current, ret);
- }
- inc_syscw(current);
- file_end_write(file);
+ if (ret)
+ return ret;
+ if (count > MAX_RW_COUNT)
+ count = MAX_RW_COUNT;
+ file_start_write(file);
+ if (file->f_op->write)
+ ret = file->f_op->write(file, buf, count, pos);
+ else if (file->f_op->write_iter)
+ ret = new_sync_write(file, buf, count, pos);
+ else
+ ret = -EINVAL;
+ if (ret > 0) {
+ fsnotify_modify(file);
+ add_wchar(current, ret);
}
-
+ inc_syscw(current);
+ file_end_write(file);
return ret;
}
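
The refactor above splits each path into an outer helper that runs rw_verify_area() and an inner helper that only clamps the count and dispatches to ->read/->read_iter. A simplified userspace sketch of that two-layer shape (names and the clamp value are illustrative, not the kernel's):

#include <stdio.h>
#include <sys/types.h>

#define MAX_RW_COUNT (1 << 20)	/* illustrative clamp, not the kernel value */

/* Inner helper: no permission check, just clamp and "do" the I/O
 * (mirrors the role of __kernel_read()/__kernel_write()). */
static ssize_t do_read(size_t count)
{
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;
	/* real I/O would happen here */
	return (ssize_t)count;
}

/* Outer helper: verify first, then delegate (mirrors kernel_read()). */
static ssize_t checked_read(size_t count, int allowed)
{
	if (!allowed)
		return -1;	/* stands in for rw_verify_area() failing */
	return do_read(count);
}

int main(void)
{
	printf("%zd\n", checked_read(16, 1));	/* allowed: returns 16 */
	printf("%zd\n", checked_read(16, 0));	/* denied:  returns -1 */
	return 0;
}
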
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 64f61330564a..76bb1c846845 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -175,7 +175,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
/* Extract the length of the metadata block */
data = page_address(bvec->bv_page) + bvec->bv_offset;
length = data[offset];
- if (offset <= bvec->bv_len - 1) {
+ if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
} else {
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) {
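
The corrected bound reflects that two bytes are read starting at offset, so the second byte only lives in this segment when offset < len - 1. A standalone illustration of that bounds check (read_len16() is a made-up helper, not squashfs code):

#include <stdio.h>

static int read_len16(const unsigned char *data, int len, int offset)
{
	int value = data[offset];

	if (offset < len - 1)		/* second byte is still in this buffer */
		value |= data[offset + 1] << 8;
	else
		printf("second byte lives in the next segment\n");
	return value;
}

int main(void)
{
	unsigned char buf[4] = { 0x34, 0x12, 0x00, 0x00 };

	printf("%#x\n", read_len16(buf, 4, 0));	/* both bytes present */
	printf("%#x\n", read_len16(buf, 4, 3));	/* only the first byte */
	return 0;
}
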
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b43f0e8f43f2..9ed90368ab31 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -671,7 +671,8 @@ xlog_cil_push_work(
/*
* Wake up any background push waiters now this context is being pushed.
*/
- wake_up_all(&ctx->push_wait);
+ if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ wake_up_all(&cil->xc_push_wait);
/*
* Check if we've anything to push. If there is nothing, then we don't
@@ -743,13 +744,12 @@ xlog_cil_push_work(
/*
* initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing lsit so it can be found
+ * the current context to the CIL committing list so it can be found
* during log forces to extract the commit lsn of the sequence that
* needs to be forced.
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
- init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
@@ -937,7 +937,7 @@ xlog_cil_push_background(
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
- xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
return;
}
@@ -1216,12 +1216,12 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_commit_wait);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
- init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
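
Moving the push wait queue from the per-checkpoint context to the long-lived CIL means throttled writers no longer sleep on an object that is swapped out at every push. A rough userspace analogue with a condition variable owned by the parent structure (struct cil here is illustrative, not the XFS type):

#include <pthread.h>
#include <stdio.h>

struct cil {
	pthread_mutex_t lock;
	pthread_cond_t  push_wait;	/* was per-ctx, now per-cil */
	int             space_used;
	int             limit;
};

static void push_background(struct cil *c, int bytes)
{
	pthread_mutex_lock(&c->lock);
	c->space_used += bytes;
	while (c->space_used >= c->limit)	/* throttle over the limit */
		pthread_cond_wait(&c->push_wait, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void push_work_done(struct cil *c)
{
	pthread_mutex_lock(&c->lock);
	c->space_used = 0;
	pthread_cond_broadcast(&c->push_wait);	/* wake_up_all() equivalent */
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct cil c = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.push_wait = PTHREAD_COND_INITIALIZER,
		.space_used = 0,
		.limit = 1024,
	};

	push_background(&c, 100);	/* below the limit, does not sleep */
	push_work_done(&c);
	printf("space_used after push: %d\n", c.space_used);
	return 0;
}
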
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ec22c7a3867f..75a62870b63a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -240,7 +240,6 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
- wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
@@ -274,6 +273,7 @@ struct xfs_cil {
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
struct work_struct xc_push_work;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
/*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 07bc42d62673..abfb17f88f9a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -607,14 +607,14 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
int nr_pages;
ssize_t ret;
- nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
- if (!nr_pages)
- return 0;
-
max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
+ nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+ if (!nr_pages)
+ return 0;
+
bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
if (!bio)
return -ENOMEM;
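
Reordering the truncation ahead of iov_iter_npages() means the page count, and the early return on an empty iterator, reflect the clamped request rather than the original one. A tiny userspace illustration of why the order matters (PAGE_SIZE, npages() and max_append are illustrative values, not zonefs code):

#include <stdio.h>

#define PAGE_SIZE 4096

static int npages(size_t count)
{
	return (int)((count + PAGE_SIZE - 1) / PAGE_SIZE);
}

int main(void)
{
	size_t requested = 10 * PAGE_SIZE;
	size_t max_append = 4 * PAGE_SIZE;	/* device zone-append limit */

	printf("counted before truncate: %d pages\n", npages(requested));
	if (requested > max_append)
		requested = max_append;		/* the iov_iter_truncate() step */
	printf("counted after truncate:  %d pages\n", npages(requested));
	return 0;
}
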
@@ -1119,7 +1119,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
char *file_name;
struct dentry *dir;
unsigned int n = 0;
- int ret = -ENOMEM;
+ int ret;
/* If the group is empty, there is nothing to do */
if (!zd->nr_zones[type])
@@ -1135,8 +1135,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
zgroup_name = "seq";
dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (!dir)
+ if (!dir) {
+ ret = -ENOMEM;
goto free;
+ }
/*
* The first zone contains the super block: skip it.
@@ -1174,8 +1176,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
* Use the file number within its group as file name.
*/
snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- if (!zonefs_create_inode(dir, file_name, zone, type))
+ if (!zonefs_create_inode(dir, file_name, zone, type)) {
+ ret = -ENOMEM;
goto free;
+ }
n++;
}