Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_inode.c | 34
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 15
-rw-r--r-- fs/afs/dir_edit.c | 4
-rw-r--r-- fs/afs/dir_search.c | 2
-rw-r--r-- fs/afs/internal.h | 6
-rw-r--r-- fs/afs/mntpt.c | 3
-rw-r--r-- fs/bpf_fs_kfuncs.c | 2
-rw-r--r-- fs/btrfs/tree-log.c | 2
-rw-r--r-- fs/configfs/dir.c | 5
-rw-r--r-- fs/configfs/symlink.c | 33
-rw-r--r-- fs/cramfs/inode.c | 2
-rw-r--r-- fs/dcache.c | 30
-rw-r--r-- fs/ecryptfs/dentry.c | 14
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 27
-rw-r--r-- fs/ecryptfs/file.c | 15
-rw-r--r-- fs/ecryptfs/inode.c | 19
-rw-r--r-- fs/ecryptfs/main.c | 24
-rw-r--r-- fs/exfat/balloc.c | 85
-rw-r--r-- fs/exfat/dir.c | 160
-rw-r--r-- fs/exfat/exfat_fs.h | 7
-rw-r--r-- fs/exfat/exfat_raw.h | 6
-rw-r--r-- fs/exfat/fatent.c | 11
-rw-r--r-- fs/exfat/file.c | 52
-rw-r--r-- fs/exfat/inode.c | 2
-rw-r--r-- fs/exfat/namei.c | 4
-rw-r--r-- fs/exfat/nls.c | 2
-rw-r--r-- fs/exfat/super.c | 68
-rw-r--r-- fs/ext4/Kconfig | 27
-rw-r--r-- fs/ext4/ext4.h | 28
-rw-r--r-- fs/ext4/fast_commit.c | 2
-rw-r--r-- fs/ext4/file.c | 2
-rw-r--r-- fs/ext4/fsmap.c | 14
-rw-r--r-- fs/ext4/indirect.c | 2
-rw-r--r-- fs/ext4/inode.c | 47
-rw-r--r-- fs/ext4/ioctl.c | 312
-rw-r--r-- fs/ext4/mballoc.c | 10
-rw-r--r-- fs/ext4/mmp.c | 6
-rw-r--r-- fs/ext4/move_extent.c | 2
-rw-r--r-- fs/ext4/orphan.c | 19
-rw-r--r-- fs/ext4/super.c | 38
-rw-r--r-- fs/ext4/xattr.c | 21
-rw-r--r-- fs/f2fs/checkpoint.c | 53
-rw-r--r-- fs/f2fs/compress.c | 43
-rw-r--r-- fs/f2fs/data.c | 59
-rw-r--r-- fs/f2fs/dir.c | 17
-rw-r--r-- fs/f2fs/extent_cache.c | 15
-rw-r--r-- fs/f2fs/f2fs.h | 88
-rw-r--r-- fs/f2fs/file.c | 49
-rw-r--r-- fs/f2fs/gc.c | 25
-rw-r--r-- fs/f2fs/node.c | 77
-rw-r--r-- fs/f2fs/node.h | 1
-rw-r--r-- fs/f2fs/recovery.c | 2
-rw-r--r-- fs/f2fs/segment.c | 30
-rw-r--r-- fs/f2fs/segment.h | 28
-rw-r--r-- fs/f2fs/super.c | 121
-rw-r--r-- fs/f2fs/sysfs.c | 119
-rw-r--r-- fs/fat/dir.c | 7
-rw-r--r-- fs/file_table.c | 6
-rw-r--r-- fs/fs_context.c | 17
-rw-r--r-- fs/fuse/Kconfig | 2
-rw-r--r-- fs/fuse/Makefile | 5
-rw-r--r-- fs/fuse/backing.c | 179
-rw-r--r-- fs/fuse/cuse.c | 3
-rw-r--r-- fs/fuse/dev.c | 227
-rw-r--r-- fs/fuse/dev_uring.c | 8
-rw-r--r-- fs/fuse/dir.c | 21
-rw-r--r-- fs/fuse/file.c | 86
-rw-r--r-- fs/fuse/fuse_dev_i.h | 13
-rw-r--r-- fs/fuse/fuse_i.h | 70
-rw-r--r-- fs/fuse/inode.c | 76
-rw-r--r-- fs/fuse/iomode.c | 3
-rw-r--r-- fs/fuse/passthrough.c | 167
-rw-r--r-- fs/fuse/trace.c | 13
-rw-r--r-- fs/fuse/virtio_fs.c | 12
-rw-r--r-- fs/gfs2/inode.c | 26
-rw-r--r-- fs/internal.h | 6
-rw-r--r-- fs/jbd2/checkpoint.c | 2
-rw-r--r-- fs/jfs/inode.c | 8
-rw-r--r-- fs/jfs/jfs_dtree.c | 4
-rw-r--r-- fs/jfs/jfs_logmgr.c | 1
-rw-r--r-- fs/jfs/jfs_mount.c | 10
-rw-r--r-- fs/jfs/jfs_txnmgr.c | 9
-rw-r--r-- fs/lockd/svc.c | 6
-rw-r--r-- fs/mount.h | 39
-rw-r--r-- fs/namei.c | 12
-rw-r--r-- fs/namespace.c | 1004
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 8
-rw-r--r-- fs/nfs/blocklayout/dev.c | 8
-rw-r--r-- fs/nfs/callback.c | 10
-rw-r--r-- fs/nfs/dir.c | 26
-rw-r--r-- fs/nfs/file.c | 29
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 10
-rw-r--r-- fs/nfs/filelayout/filelayoutdev.c | 10
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 786
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.h | 64
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 115
-rw-r--r-- fs/nfs/fs_context.c | 3
-rw-r--r-- fs/nfs/inode.c | 15
-rw-r--r-- fs/nfs/internal.h | 10
-rw-r--r-- fs/nfs/localio.c | 405
-rw-r--r-- fs/nfs/namespace.c | 3
-rw-r--r-- fs/nfs/nfs2xdr.c | 2
-rw-r--r-- fs/nfs/nfs3xdr.c | 2
-rw-r--r-- fs/nfs/nfs42proc.c | 4
-rw-r--r-- fs/nfs/nfs42xdr.c | 2
-rw-r--r-- fs/nfs/nfs4file.c | 1
-rw-r--r-- fs/nfs/nfs4proc.c | 12
-rw-r--r-- fs/nfs/nfs4state.c | 3
-rw-r--r-- fs/nfs/nfs4super.c | 44
-rw-r--r-- fs/nfs/nfs4xdr.c | 4
-rw-r--r-- fs/nfs/nfstrace.h | 215
-rw-r--r-- fs/nfs/write.c | 34
-rw-r--r-- fs/nfsd/export.c | 4
-rw-r--r-- fs/nfsd/export.h | 2
-rw-r--r-- fs/nfsd/filecache.c | 34
-rw-r--r-- fs/nfsd/filecache.h | 4
-rw-r--r-- fs/nfsd/localio.c | 11
-rw-r--r-- fs/nfsd/nfsctl.c | 139
-rw-r--r-- fs/nfsd/nfssvc.c | 7
-rw-r--r-- fs/nfsd/trace.h | 27
-rw-r--r-- fs/nfsd/vfs.h | 4
-rw-r--r-- fs/notify/fanotify/fanotify.h | 2
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 105
-rw-r--r-- fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r-- fs/nsfs.c | 2
-rw-r--r-- fs/ntfs3/bitmap.c | 1
-rw-r--r-- fs/ntfs3/file.c | 28
-rw-r--r-- fs/ntfs3/index.c | 10
-rw-r--r-- fs/ntfs3/inode.c | 1
-rw-r--r-- fs/ntfs3/ntfs_fs.h | 2
-rw-r--r-- fs/ntfs3/run.c | 12
-rw-r--r-- fs/ocfs2/alloc.c | 3
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 11
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r-- fs/ocfs2/inode.c | 8
-rw-r--r-- fs/ocfs2/ioctl.c | 18
-rw-r--r-- fs/ocfs2/move_extents.c | 8
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 2
-rw-r--r-- fs/ocfs2/stack_user.c | 1
-rw-r--r-- fs/ocfs2/sysfile.c | 12
-rw-r--r-- fs/open.c | 20
-rw-r--r-- fs/orangefs/namei.c | 10
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 11
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 2
-rw-r--r-- fs/orangefs/xattr.c | 12
-rw-r--r-- fs/overlayfs/copy_up.c | 4
-rw-r--r-- fs/overlayfs/dir.c | 29
-rw-r--r-- fs/overlayfs/file.c | 2
-rw-r--r-- fs/overlayfs/inode.c | 1
-rw-r--r-- fs/overlayfs/namei.c | 17
-rw-r--r-- fs/overlayfs/overlayfs.h | 16
-rw-r--r-- fs/overlayfs/ovl_entry.h | 1
-rw-r--r-- fs/overlayfs/params.c | 15
-rw-r--r-- fs/overlayfs/params.h | 1
-rw-r--r-- fs/overlayfs/readdir.c | 126
-rw-r--r-- fs/overlayfs/super.c | 66
-rw-r--r-- fs/overlayfs/util.c | 14
-rw-r--r-- fs/pidfs.c | 2
-rw-r--r-- fs/pnode.c | 75
-rw-r--r-- fs/pnode.h | 1
-rw-r--r-- fs/proc/base.c | 2
-rw-r--r-- fs/quota/dquot.c | 10
-rw-r--r-- fs/smb/client/Kconfig | 1
-rw-r--r-- fs/smb/client/cached_dir.c | 50
-rw-r--r-- fs/smb/client/cached_dir.h | 16
-rw-r--r-- fs/smb/client/cifs_debug.c | 41
-rw-r--r-- fs/smb/client/cifsencrypt.c | 8
-rw-r--r-- fs/smb/client/cifsfs.c | 40
-rw-r--r-- fs/smb/client/dir.c | 54
-rw-r--r-- fs/smb/client/fs_context.c | 11
-rw-r--r-- fs/smb/client/inode.c | 2
-rw-r--r-- fs/smb/client/readdir.c | 40
-rw-r--r-- fs/smb/client/smb2ops.c | 22
-rw-r--r-- fs/smb/client/smb2pdu.c | 18
-rw-r--r-- fs/smb/client/transport.c | 13
-rw-r--r-- fs/smb/common/Makefile | 1
-rw-r--r-- fs/smb/common/arc4.h | 23
-rw-r--r-- fs/smb/common/cifs_arc4.c | 75
-rw-r--r-- fs/smb/server/Kconfig | 1
-rw-r--r-- fs/smb/server/auth.c | 9
-rw-r--r-- fs/smb/server/connection.c | 23
-rw-r--r-- fs/smb/server/connection.h | 6
-rw-r--r-- fs/smb/server/ksmbd_netlink.h | 5
-rw-r--r-- fs/smb/server/mgmt/share_config.c | 2
-rw-r--r-- fs/smb/server/mgmt/user_session.c | 28
-rw-r--r-- fs/smb/server/server.h | 1
-rw-r--r-- fs/smb/server/smb2pdu.c | 7
-rw-r--r-- fs/smb/server/transport_ipc.c | 3
-rw-r--r-- fs/smb/server/transport_rdma.c | 5
-rw-r--r-- fs/smb/server/transport_tcp.c | 98
-rw-r--r-- fs/smb/server/vfs.c | 24
-rw-r--r-- fs/smb/server/vfs.h | 6
-rw-r--r-- fs/squashfs/file.c | 137
-rw-r--r-- fs/squashfs/inode.c | 39
-rw-r--r-- fs/squashfs/squashfs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs.h | 1
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 2
-rw-r--r-- fs/stat.c | 2
-rw-r--r-- fs/super.c | 3
-rw-r--r-- fs/udf/inode.c | 3
-rw-r--r-- fs/vboxsf/dir.c | 25
201 files changed, 5003 insertions(+), 2499 deletions(-)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 399d455d50d6..d0c77ec31b1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -768,22 +768,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct v9fs_inode __maybe_unused *v9inode;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
- struct dentry *res = NULL;
struct inode *inode;
int p9_omode;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
@@ -795,17 +791,17 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
"write-only file with writeback enabled, creating w/ O_RDWR\n");
}
fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- goto error;
- }
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
inode = d_inode(dentry);
v9inode = V9FS_I(inode);
err = finish_open(file, dentry, generic_file_open);
- if (err)
- goto error;
+ if (unlikely(err)) {
+ p9_fid_put(fid);
+ return err;
+ }
file->private_data = fid;
#ifdef CONFIG_9P_FSCACHE
@@ -818,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
-out:
- dput(res);
- return err;
-
-error:
- p9_fid_put(fid);
- goto out;
+ return 0;
}
/**
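[Editor's note, not part of the patch] The rewrite above is an instance of the
usual ->atomic_open() contract: if the dentry is still in lookup, perform the
lookup yourself, and hand anything that is not a plain O_CREAT create back to
the VFS via finish_no_open(). A minimal sketch of the pattern, using
hypothetical foo_* helpers:

	static int foo_atomic_open(struct inode *dir, struct dentry *dentry,
				   struct file *file, unsigned int flags,
				   umode_t mode)
	{
		if (d_in_lookup(dentry)) {
			struct dentry *res = foo_lookup(dir, dentry, 0);

			/* error, alias or already-positive: let the VFS open it */
			if (res || d_really_is_positive(dentry))
				return finish_no_open(file, res);
		}
		if (!(flags & O_CREAT))
			return finish_no_open(file, NULL);
		/* ... create the inode, then finish_open(file, dentry, ...) ... */
		return 0;
	}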
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5b5fda617b80..be297e335468 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -238,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct p9_fid *dfid = NULL, *ofid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
- struct dentry *res = NULL;
if (d_in_lookup(dentry)) {
- res = v9fs_vfs_lookup(dir, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
v9ses = v9fs_inode2v9ses(dir);
@@ -337,7 +333,6 @@ out:
p9_fid_put(ofid);
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- dput(res);
return err;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 4b1342c72089..fd3aa9f97ce6 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -239,7 +239,7 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta,
* The caller must hold the inode locked.
*/
void afs_edit_dir_add(struct afs_vnode *vnode,
- struct qstr *name, struct afs_fid *new_fid,
+ const struct qstr *name, struct afs_fid *new_fid,
enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block;
@@ -391,7 +391,7 @@ error:
* The caller must hold the inode locked.
*/
void afs_edit_dir_remove(struct afs_vnode *vnode,
- struct qstr *name, enum afs_edit_dir_reason why)
+ const struct qstr *name, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block, *pblock;
union afs_xdr_dirent *de, *pde;
diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c
index b25bd892db4d..d2516e55b5ed 100644
--- a/fs/afs/dir_search.c
+++ b/fs/afs/dir_search.c
@@ -188,7 +188,7 @@ bad:
/*
* Search the appropriate hash chain in the contents of an AFS directory.
*/
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version)
{
struct afs_dir_iter iter = { .dvnode = dvnode, };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 444a3ea4fdf6..a45ae5c2ef8a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1099,9 +1099,9 @@ int afs_single_writepages(struct address_space *mapping,
/*
* dir_edit.c
*/
-extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
+extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
-extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason);
void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
@@ -1114,7 +1114,7 @@ bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid);
-int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
+int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version);
/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 9434a5399f2b..1ad048e6e164 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ret = -EINVAL;
if (content[size - 1] == '.')
- ret = vfs_parse_fs_string(fc, "source", content, size - 1);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(content, size - 1));
do_delayed_call(&cleanup);
if (ret < 0)
return ret;
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 1e36a12b88f7..5ace2511fec5 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -79,7 +79,7 @@ __bpf_kfunc void bpf_put_file(struct file *file)
* pathname in *buf*, including the NUL termination character. On error, a
* negative integer is returned.
*/
-__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
+__bpf_kfunc int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz)
{
int len;
char *ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6aad6b65522b..621e0df097e3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5829,7 +5829,7 @@ struct btrfs_dir_list {
* See process_dir_items_leaf() for details about why it is needed.
* This is a recursive operation - if an existing dentry corresponds to a
* directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
* the dentries point to we do not acquire their VFS lock, otherwise lockdep
* complains about the following circular lock dependency / possible deadlock:
*
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f327fbb9a0ca..81f4f06bc87e 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1601,10 +1601,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
err = -ENOENT;
if (configfs_dirent_is_ready(parent_sd)) {
file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
- if (IS_ERR(file->private_data))
- err = PTR_ERR(file->private_data);
- else
- err = 0;
+ err = PTR_ERR_OR_ZERO(file->private_data);
}
inode_unlock(d_inode(dentry));
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 69133ec1fac2..f3f79c67add5 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -114,26 +114,21 @@ static int create_link(struct config_item *parent_item,
}
-static int get_target(const char *symname, struct path *path,
- struct config_item **target, struct super_block *sb)
+static int get_target(const char *symname, struct config_item **target,
+ struct super_block *sb)
{
+ struct path path __free(path_put) = {};
int ret;
- ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
- if (!ret) {
- if (path->dentry->d_sb == sb) {
- *target = configfs_get_config_item(path->dentry);
- if (!*target) {
- ret = -ENOENT;
- path_put(path);
- }
- } else {
- ret = -EPERM;
- path_put(path);
- }
- }
-
- return ret;
+ ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ if (ret)
+ return ret;
+ if (path.dentry->d_sb != sb)
+ return -EPERM;
+ *target = configfs_get_config_item(path.dentry);
+ if (!*target)
+ return -ENOENT;
+ return 0;
}
@@ -141,7 +136,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
int ret;
- struct path path;
struct configfs_dirent *sd;
struct config_item *parent_item;
struct config_item *target_item = NULL;
@@ -188,7 +182,7 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* AV, a thoroughly annoyed bastard.
*/
inode_unlock(dir);
- ret = get_target(symname, &path, &target_item, dentry->d_sb);
+ ret = get_target(symname, &target_item, dentry->d_sb);
inode_lock(dir);
if (ret)
goto out_put;
@@ -210,7 +204,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
}
config_item_put(target_item);
- path_put(&path);
out_put:
config_item_put(parent_item);
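[Editor's note, not part of the patch] get_target() now relies on the
scope-based cleanup helpers from <linux/cleanup.h>: declaring the path with
__free(path_put) makes path_put() run automatically on every return, which is
what lets the error paths above collapse into bare returns. A hypothetical
stand-alone sketch of the idiom:

	#include <linux/cleanup.h>
	#include <linux/namei.h>

	static int example_path_check(const char *name, struct super_block *sb)
	{
		struct path path __free(path_put) = {};
		int ret;

		ret = kern_path(name, LOOKUP_FOLLOW, &path);
		if (ret)
			return ret;	/* path is still {}, path_put() is a no-op */
		if (path.dentry->d_sb != sb)
			return -EPERM;	/* path_put() runs automatically here */
		return 0;		/* ... and here */
	}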
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 12daa85ed941..ca54bf24b719 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -421,7 +421,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
vmf = vmf_insert_mixed(vma, vma->vm_start + off,
- address + off);
+ PHYS_PFN(address + off));
if (vmf & VM_FAULT_ERROR)
ret = vm_fault_to_errno(vmf, 0);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 65cc11939654..a067fa0a965a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1390,6 +1390,7 @@ struct check_mount {
unsigned int mounted;
};
+/* locks: mount_locked_reader && dentry->d_lock */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
struct check_mount *info = data;
@@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent)
{
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
d_walk(parent->dentry, &data, path_check_mount);
- read_sequnlock_excl(&mount_lock);
return data.mounted;
}
@@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_shortname.string;
}
- dentry->d_name.len = name->len;
- dentry->d_name.hash = name->hash;
+ dentry->__d_name.len = name->len;
+ dentry->__d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
/* Make sure we always see the terminating NUL character */
- smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
+ smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */
dentry->d_flags = 0;
lockref_init(&dentry->d_lockref);
@@ -2743,15 +2743,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
/*
* Both external: swap the pointers
*/
- swap(target->d_name.name, dentry->d_name.name);
+ swap(target->__d_name.name, dentry->__d_name.name);
} else {
/*
* dentry:internal, target:external. Steal target's
* storage and make target internal.
*/
- dentry->d_name.name = target->d_name.name;
+ dentry->__d_name.name = target->__d_name.name;
target->d_shortname = dentry->d_shortname;
- target->d_name.name = target->d_shortname.string;
+ target->__d_name.name = target->d_shortname.string;
}
} else {
if (unlikely(dname_external(dentry))) {
@@ -2759,9 +2759,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
*/
- target->d_name.name = dentry->d_name.name;
+ target->__d_name.name = dentry->__d_name.name;
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.name = dentry->d_shortname.string;
} else {
/*
* Both are internal.
@@ -2771,7 +2771,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
target->d_shortname.words[i]);
}
}
- swap(dentry->d_name.hash_len, target->d_name.hash_len);
+ swap(dentry->__d_name.hash_len, target->__d_name.hash_len);
}
static void copy_name(struct dentry *dentry, struct dentry *target)
@@ -2781,11 +2781,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
old_name = external_name(dentry);
if (unlikely(dname_external(target))) {
atomic_inc(&external_name(target)->count);
- dentry->d_name = target->d_name;
+ dentry->__d_name = target->__d_name;
} else {
dentry->d_shortname = target->d_shortname;
- dentry->d_name.name = dentry->d_shortname.string;
- dentry->d_name.hash_len = target->d_name.hash_len;
+ dentry->__d_name.name = dentry->d_shortname.string;
+ dentry->__d_name.hash_len = target->__d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->count)))
kfree_rcu(old_name, head);
@@ -3134,7 +3134,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode)
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
+ dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
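[Editor's note, not part of the patch] path_has_submounts() now takes the
mount lock through guard(mount_locked_reader), a lock guard built on
<linux/cleanup.h>, so the unlock runs automatically when the function returns.
The general shape of the idiom, shown with the stock mutex guard:

	static int example(struct mutex *m)
	{
		guard(mutex)(m);	/* mutex_lock(m) happens here */
		/* ... critical section; any return path is safe ... */
		return 0;		/* mutex_unlock(m) runs at scope exit */
	}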
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1dfd5b81d831..6648a924e31a 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name,
return rc;
}
-struct kmem_cache *ecryptfs_dentry_info_cache;
-
-static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(ecryptfs_dentry_info_cache,
- container_of(head, struct ecryptfs_dentry_info, rcu));
-}
-
/**
* ecryptfs_d_release
* @dentry: The ecryptfs dentry
@@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
*/
static void ecryptfs_d_release(struct dentry *dentry)
{
- struct ecryptfs_dentry_info *p = dentry->d_fsdata;
- if (p) {
- path_put(&p->lower_path);
- call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
- }
+ dput(dentry->d_fsdata);
}
const struct dentry_operations ecryptfs_dops = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1f562e75d0e4..9e6ab0b41337 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -258,13 +258,6 @@ struct ecryptfs_inode_info {
struct ecryptfs_crypt_stat crypt_stat;
};
-/* dentry private data. Each dentry must keep track of a lower
- * vfsmount too. */
-struct ecryptfs_dentry_info {
- struct path lower_path;
- struct rcu_head rcu;
-};
-
/**
* ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint
* @flags: Status flags
@@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat {
/* superblock private data. */
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
+ struct vfsmount *lower_mnt;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
};
@@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
}
static inline void
-ecryptfs_set_dentry_private(struct dentry *dentry,
- struct ecryptfs_dentry_info *dentry_info)
+ecryptfs_set_dentry_lower(struct dentry *dentry,
+ struct dentry *lower_dentry)
{
- dentry->d_fsdata = dentry_info;
+ dentry->d_fsdata = lower_dentry;
}
static inline struct dentry *
ecryptfs_dentry_to_lower(struct dentry *dentry)
{
- return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
+ return dentry->d_fsdata;
}
-static inline const struct path *
-ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+static inline struct path
+ecryptfs_lower_path(struct dentry *dentry)
{
- return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+ return (struct path){
+ .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt,
+ .dentry = ecryptfs_dentry_to_lower(dentry)
+ };
}
#define ecryptfs_printk(type, fmt, arg...) \
@@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users;
extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
extern struct kmem_cache *ecryptfs_file_info_cache;
-extern struct kmem_cache *ecryptfs_dentry_info_cache;
extern struct kmem_cache *ecryptfs_inode_info_cache;
extern struct kmem_cache *ecryptfs_sb_info_cache;
extern struct kmem_cache *ecryptfs_header_cache;
@@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename(
size_t *encoded_name_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
-struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
void ecryptfs_dump_hex(char *data, int bytes);
int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
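[Editor's note, not part of the patch] The ecryptfs change is a data-structure
simplification: every dentry of an ecryptfs mount sits on the same lower
vfsmount, so the mount reference moves from per-dentry state into
ecryptfs_sb_info, d_fsdata shrinks to just the lower dentry, and
ecryptfs_lower_path() composes the two into a struct path by value on demand.
Callers switch from a borrowed pointer to a stack value, as in the file.c
hunks below:

	struct path lower = ecryptfs_lower_path(dentry);	/* by value */
	touch_atime(&lower);	/* no extra get/put needed: both references
				 * are pinned by the dentry and the sb */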
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 5f8f96da09fe..7929411837cf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(file->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos,
size_t len, unsigned int flags)
{
ssize_t rc;
- const struct path *path;
rc = filemap_splice_read(in, ppos, pipe, len, flags);
if (rc >= 0) {
- path = ecryptfs_dentry_to_lower_path(in->f_path.dentry);
- touch_atime(path);
+ struct path path = ecryptfs_lower_path(in->f_path.dentry);
+ touch_atime(&path);
}
return rc;
}
@@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
* ecryptfs_lookup() */
struct ecryptfs_file_info *file_info;
struct file *lower_file;
+ struct path path;
/* Released in ecryptfs_release or end of function if failure */
file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
@@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file)
"Error attempting to allocate memory\n");
return -ENOMEM;
}
- lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
- file->f_flags, current_cred());
+ path = ecryptfs_lower_path(ecryptfs_dentry);
+ lower_file = dentry_open(&path, file->f_flags, current_cred());
if (IS_ERR(lower_file)) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index abd954c6a14e..ed1394da8d6b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent);
struct inode *inode, *lower_inode;
- struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
- dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!dentry_info) {
- dput(lower_dentry);
- return ERR_PTR(-ENOMEM);
- }
-
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(path->dentry));
+ d_inode(lower_parent));
BUG_ON(!d_count(lower_dentry));
- ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = mntget(path->mnt);
- dentry_info->lower_path.dentry = lower_dentry;
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
/*
* negative dentry can go positive under us here - its parent is not
@@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
{
struct dentry *dentry = path->dentry;
struct kstat lower_stat;
+ struct path lower_path = ecryptfs_lower_path(dentry);
int rc;
- rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index eab1beb846d3..16ea14dd2c62 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ struct path path = ecryptfs_lower_path(dentry);
int rc;
- rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
- cred);
+ rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred);
if (rc) {
printk(KERN_ERR "Error opening lower file "
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
- "rc = [%d]\n", path->dentry, path->mnt, rc);
+ "rc = [%d]\n", path.dentry, path.mnt, rc);
(*lower_file) = NULL;
}
return rc;
@@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc)
struct ecryptfs_fs_context *ctx = fc->fs_private;
struct ecryptfs_sb_info *sbi = fc->s_fs_info;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
struct inode *inode;
struct path path;
@@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out_free;
}
- rc = -ENOMEM;
- root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!root_info)
- goto out_free;
-
- /* ->kill_sb() will take care of root_info */
- ecryptfs_set_dentry_private(s->s_root, root_info);
- root_info->lower_path = path;
+ ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+ ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt;
s->s_flags |= SB_ACTIVE;
fc->root = dget(s->s_root);
@@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
kill_anon_super(sb);
if (!sb_info)
return;
+ mntput(sb_info->lower_mnt);
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
@@ -668,11 +661,6 @@ static struct ecryptfs_cache_info {
.size = sizeof(struct ecryptfs_file_info),
},
{
- .cache = &ecryptfs_dentry_info_cache,
- .name = "ecryptfs_dentry_info_cache",
- .size = sizeof(struct ecryptfs_dentry_info),
- },
- {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index cc01556c9d9b..2d2d510f2372 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -26,13 +27,58 @@
/*
* Allocation Bitmap Management Functions
*/
+static bool exfat_test_bitmap_range(struct super_block *sb, unsigned int clu,
+ unsigned int count)
+{
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ unsigned int start = clu;
+ unsigned int end = clu + count;
+ unsigned int ent_idx, i, b;
+ unsigned int bit_offset, bits_to_check;
+ __le_long *bitmap_le;
+ unsigned long mask, word;
+
+ if (!is_valid_cluster(sbi, start) || !is_valid_cluster(sbi, end - 1))
+ return false;
+
+ while (start < end) {
+ ent_idx = CLUSTER_TO_BITMAP_ENT(start);
+ i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+ b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+
+ bitmap_le = (__le_long *)sbi->vol_amap[i]->b_data;
+
+ /* Calculate how many bits we can check in the current word */
+ bit_offset = b % BITS_PER_LONG;
+ bits_to_check = min(end - start,
+ (unsigned int)(BITS_PER_LONG - bit_offset));
+
+ /* Create a bitmask for the range of bits to check */
+ if (bits_to_check >= BITS_PER_LONG)
+ mask = ~0UL;
+ else
+ mask = ((1UL << bits_to_check) - 1) << bit_offset;
+ word = lel_to_cpu(bitmap_le[b / BITS_PER_LONG]);
+
+ /* Check if all bits in the mask are set */
+ if ((word & mask) != mask)
+ return false;
+
+ start += bits_to_check;
+ }
+
+ return true;
+}
+
static int exfat_allocate_bitmap(struct super_block *sb,
struct exfat_dentry *ep)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct blk_plug plug;
long long map_size;
- unsigned int i, need_map_size;
+ unsigned int i, j, need_map_size;
sector_t sector;
+ unsigned int max_ra_count;
sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu);
map_size = le64_to_cpu(ep->dentry.bitmap.size);
@@ -56,22 +102,37 @@ static int exfat_allocate_bitmap(struct super_block *sb,
return -ENOMEM;
sector = exfat_cluster_to_sector(sbi, sbi->map_clu);
+ max_ra_count = min(sb->s_bdi->ra_pages, sb->s_bdi->io_pages) <<
+ (PAGE_SHIFT - sb->s_blocksize_bits);
for (i = 0; i < sbi->map_sectors; i++) {
- sbi->vol_amap[i] = sb_bread(sb, sector + i);
- if (!sbi->vol_amap[i]) {
- /* release all buffers and free vol_amap */
- int j = 0;
-
- while (j < i)
- brelse(sbi->vol_amap[j++]);
-
- kvfree(sbi->vol_amap);
- sbi->vol_amap = NULL;
- return -EIO;
+ /* Trigger the next readahead in advance. */
+ if (0 == (i % max_ra_count)) {
+ blk_start_plug(&plug);
+ for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++)
+ sb_breadahead(sb, sector + j);
+ blk_finish_plug(&plug);
}
+
+ sbi->vol_amap[i] = sb_bread(sb, sector + i);
+ if (!sbi->vol_amap[i])
+ goto err_out;
}
+ if (exfat_test_bitmap_range(sb, sbi->map_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(map_size, sbi)) == false)
+ goto err_out;
+
return 0;
+
+err_out:
+ j = 0;
+ /* release all buffers and free vol_amap */
+ while (j < i)
+ brelse(sbi->vol_amap[j++]);
+
+ kvfree(sbi->vol_amap);
+ sbi->vol_amap = NULL;
+ return -EIO;
}
int exfat_load_bitmap(struct super_block *sb)
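[Editor's note, not part of the patch] exfat_test_bitmap_range() validates the
bitmap a word at a time: for each word it builds a mask covering bits
[bit_offset, bit_offset + bits_to_check) and requires all of them to be set.
A hypothetical stand-alone version of the same word-at-a-time test:

	#include <stdbool.h>

	#define WORD_BITS ((unsigned int)(8 * sizeof(unsigned long)))

	static bool bits_all_set(const unsigned long *map, unsigned int start,
				 unsigned int count)
	{
		while (count) {
			unsigned int off = start % WORD_BITS;
			unsigned int n = count < WORD_BITS - off ?
					 count : WORD_BITS - off;
			/* n == WORD_BITS only when off == 0, so the shift is safe */
			unsigned long mask = n >= WORD_BITS ?
					     ~0UL : ((1UL << n) - 1) << off;

			if ((map[start / WORD_BITS] & mask) != mask)
				return false;
			start += n;
			count -= n;
		}
		return true;
	}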
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index ee060e26f51d..7229146fe2bf 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -1244,3 +1244,163 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
return count;
}
+
+static int exfat_get_volume_label_dentry(struct super_block *sb,
+ struct exfat_entry_set_cache *es)
+{
+ int i;
+ int dentry = 0;
+ unsigned int type;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_hint_femp hint_femp;
+ struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode);
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+ struct buffer_head *bh;
+
+ hint_femp.eidx = EXFAT_HINT_NONE;
+ exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+
+ while (clu.dir != EXFAT_EOF_CLUSTER) {
+ for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) {
+ ep = exfat_get_dentry(sb, &clu, i, &bh);
+ if (!ep)
+ return -EIO;
+
+ type = exfat_get_entry_type(ep);
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ if (type == TYPE_DELETED || type == TYPE_UNUSED) {
+ hint_femp.cur = clu;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 1;
+ }
+ }
+
+ if (type == TYPE_UNUSED) {
+ brelse(bh);
+ goto not_found;
+ }
+
+ if (type != TYPE_VOLUME) {
+ brelse(bh);
+ continue;
+ }
+
+ memset(es, 0, sizeof(*es));
+ es->sb = sb;
+ es->bh = es->__bh;
+ es->bh[0] = bh;
+ es->num_bh = 1;
+ es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize;
+
+ return 0;
+ }
+
+ if (exfat_get_next_cluster(sb, &(clu.dir)))
+ return -EIO;
+ }
+
+not_found:
+ if (hint_femp.eidx == EXFAT_HINT_NONE) {
+ hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ hint_femp.eidx = dentry;
+ hint_femp.count = 0;
+ }
+
+ ei->hint_femp = hint_femp;
+
+ return -ENOENT;
+}
+
+int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_entry_set_cache es;
+ struct exfat_dentry *ep;
+
+ mutex_lock(&sbi->s_lock);
+
+ memset(label_out, 0, sizeof(*label_out));
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret < 0) {
+ /*
+ * ENOENT signifies that a volume label dentry doesn't exist.
+ * We will treat this as an empty volume label and not fail.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+
+ goto unlock;
+ }
+
+ ep = exfat_get_dentry_cached(&es, 0);
+ label_out->name_len = ep->dentry.volume_label.char_count;
+ if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) {
+ ret = -EIO;
+ exfat_put_dentry_set(&es, false);
+ goto unlock;
+ }
+
+ for (i = 0; i < label_out->name_len; i++)
+ label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]);
+
+ exfat_put_dentry_set(&es, false);
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
+
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label)
+{
+ int ret, i;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct inode *root_inode = sb->s_root->d_inode;
+ struct exfat_entry_set_cache es;
+ struct exfat_chain clu;
+ struct exfat_dentry *ep;
+
+ if (label->name_len > EXFAT_VOLUME_LABEL_LEN)
+ return -EINVAL;
+
+ mutex_lock(&sbi->s_lock);
+
+ ret = exfat_get_volume_label_dentry(sb, &es);
+ if (ret == -ENOENT) {
+ if (label->name_len == 0) {
+ /* No volume label dentry, no need to clear */
+ ret = 0;
+ goto unlock;
+ }
+
+ ret = exfat_find_empty_entry(root_inode, &clu, 1, &es);
+ }
+
+ if (ret < 0)
+ goto unlock;
+
+ ep = exfat_get_dentry_cached(&es, 0);
+
+ if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) {
+ /* volume label is already cleared */
+ exfat_put_dentry_set(&es, 0);
+ goto unlock;
+ }
+
+ memset(ep, 0, sizeof(*ep));
+ ep->type = EXFAT_VOLUME;
+
+ for (i = 0; i < label->name_len; i++)
+ ep->dentry.volume_label.volume_label[i] =
+ cpu_to_le16(label->name[i]);
+
+ ep->dentry.volume_label.char_count = label->name_len;
+ es.modified = true;
+
+ ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode));
+
+unlock:
+ mutex_unlock(&sbi->s_lock);
+ return ret;
+}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f8ead4d47ef0..329697c89d09 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -477,6 +477,9 @@ int exfat_force_shutdown(struct super_block *sb, u32 flags);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
extern const struct dentry_operations exfat_utf8_dentry_ops;
+int exfat_find_empty_entry(struct inode *inode,
+ struct exfat_chain *p_dir, int num_entries,
+ struct exfat_entry_set_cache *es);
/* cache.c */
int exfat_cache_init(void);
@@ -517,6 +520,10 @@ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
unsigned int num_entries);
int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
+int exfat_read_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label_out);
+int exfat_write_volume_label(struct super_block *sb,
+ struct exfat_uni_name *label);
/* inode.c */
extern const struct inode_operations exfat_file_inode_operations;
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 971a1ccd0e89..4082fa7b8c14 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -80,6 +80,7 @@
#define BOOTSEC_OLDBPB_LEN 53
#define EXFAT_FILE_NAME_LEN 15
+#define EXFAT_VOLUME_LABEL_LEN 11
#define EXFAT_MIN_SECT_SIZE_BITS 9
#define EXFAT_MAX_SECT_SIZE_BITS 12
@@ -160,6 +161,11 @@ struct exfat_dentry {
__le64 size;
} __packed upcase; /* up-case table directory entry */
struct {
+ __u8 char_count;
+ __le16 volume_label[EXFAT_VOLUME_LABEL_LEN];
+ __u8 reserved[8];
+ } __packed volume_label; /* volume label directory entry */
+ struct {
__u8 flags;
__u8 vendor_guid[16];
__u8 vendor_defined[14];
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 232cc7f8ab92..825083634ba2 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -89,35 +89,36 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
int err;
if (!is_valid_cluster(sbi, loc)) {
- exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)",
+ exfat_fs_error_ratelimit(sb,
+ "invalid access to FAT (entry 0x%08x)",
loc);
return -EIO;
}
err = __exfat_ent_get(sb, loc, content);
if (err) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"failed to access to FAT (entry 0x%08x, err:%d)",
loc, err);
return err;
}
if (*content == EXFAT_FREE_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT free cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content == EXFAT_BAD_CLUSTER) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT bad cluster (entry 0x%08x)",
loc);
return -EIO;
}
if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) {
- exfat_fs_error(sb,
+ exfat_fs_error_ratelimit(sb,
"invalid access to FAT (entry 0x%08x) bogus content (0x%08x)",
loc, *content);
return -EIO;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 538d2b6ac2ec..f246cf439588 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -486,6 +486,54 @@ static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg)
return exfat_force_shutdown(sb, flags);
}
+static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long arg)
+{
+ int ret;
+ char label[FSLABEL_MAX] = {0};
+ struct exfat_uni_name uniname;
+
+ ret = exfat_read_volume_label(sb, &uniname);
+ if (ret < 0)
+ return ret;
+
+ ret = exfat_utf16_to_nls(sb, &uniname, label, uniname.name_len);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((char __user *)arg, label, ret + 1))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int exfat_ioctl_set_volume_label(struct super_block *sb,
+ unsigned long arg)
+{
+ int ret = 0, lossy;
+ char label[FSLABEL_MAX];
+ struct exfat_uni_name uniname;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, (char __user *)arg, FSLABEL_MAX))
+ return -EFAULT;
+
+ memset(&uniname, 0, sizeof(uniname));
+ if (label[0]) {
+ ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX,
+ &uniname, &lossy);
+ if (ret < 0)
+ return ret;
+ else if (lossy & NLS_NAME_LOSSY)
+ return -EINVAL;
+ }
+
+ uniname.name_len = ret;
+
+ return exfat_write_volume_label(sb, &uniname);
+}
+
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -500,6 +548,10 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return exfat_ioctl_shutdown(inode->i_sb, arg);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
+ case FS_IOC_GETFSLABEL:
+ return exfat_ioctl_get_volume_label(inode->i_sb, arg);
+ case FS_IOC_SETFSLABEL:
+ return exfat_ioctl_set_volume_label(inode->i_sb, arg);
default:
return -ENOTTY;
}
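[Editor's note, not part of the patch] With these two cases wired up, exfat
answers the generic FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL ioctls from
<linux/fs.h>, so generic label tooling works without exfat-specific code.
Minimal userspace read example:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		char label[FSLABEL_MAX] = "";
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);	/* any file or dir on the fs */
		if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
			perror("FS_IOC_GETFSLABEL");
			return 1;
		}
		printf("label: %s\n", label);
		return 0;
	}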
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index c10844e1e16c..f9501c3a3666 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -25,7 +25,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ bool is_dir = (ei->type == TYPE_DIR);
struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index f5f1c4e8a29f..7eb9c67fd35f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -300,7 +300,7 @@ static int exfat_check_max_dentries(struct inode *inode)
* the directory entry index in p_dir is returned on success
* -error code is returned on failure
*/
-static int exfat_find_empty_entry(struct inode *inode,
+int exfat_find_empty_entry(struct inode *inode,
struct exfat_chain *p_dir, int num_entries,
struct exfat_entry_set_cache *es)
{
@@ -587,7 +587,7 @@ unlock:
}
/* lookup a file */
-static int exfat_find(struct inode *dir, struct qstr *qname,
+static int exfat_find(struct inode *dir, const struct qstr *qname,
struct exfat_dir_entry *info)
{
int ret, dentry, count;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 1729bf42eb51..8243d94ceaf4 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -789,7 +789,7 @@ int exfat_create_upcase_table(struct super_block *sb)
return ret;
}
- if (exfat_get_next_cluster(sb, &(clu.dir)))
+ if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8926e63f5bb7..7f9592856bf7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -31,6 +31,16 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi)
kfree(sbi->options.iocharset);
}
+static void exfat_set_iocharset(struct exfat_mount_options *opts,
+ char *iocharset)
+{
+ opts->iocharset = iocharset;
+ if (!strcmp(opts->iocharset, "utf8"))
+ opts->utf8 = 1;
+ else
+ opts->utf8 = 0;
+}
+
static void exfat_put_super(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -243,11 +253,11 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_u32oct("allow_utime", Opt_allow_utime),
fsparam_string("iocharset", Opt_charset),
fsparam_enum("errors", Opt_errors, exfat_param_enums),
- fsparam_flag("discard", Opt_discard),
+ fsparam_flag_no("discard", Opt_discard),
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
- fsparam_flag("zero_size_dir", Opt_zero_size_dir),
+ fsparam_flag_no("zero_size_dir", Opt_zero_size_dir),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
__fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
@@ -292,14 +302,14 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_charset:
exfat_free_iocharset(sbi);
- opts->iocharset = param->string;
+ exfat_set_iocharset(opts, param->string);
param->string = NULL;
break;
case Opt_errors:
opts->errors = result.uint_32;
break;
case Opt_discard:
- opts->discard = 1;
+ opts->discard = !result.negated;
break;
case Opt_keep_last_dots:
opts->keep_last_dots = 1;
@@ -317,7 +327,7 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
opts->time_offset = result.int_32;
break;
case Opt_zero_size_dir:
- opts->zero_size_dir = true;
+ opts->zero_size_dir = !result.negated;
break;
case Opt_utf8:
case Opt_debug:
@@ -664,8 +674,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
/* set up enough so that it can read an inode */
exfat_hash_init(sb);
- if (!strcmp(sbi->options.iocharset, "utf8"))
- opts->utf8 = 1;
+ if (sbi->options.utf8)
+ set_default_d_op(sb, &exfat_utf8_dentry_ops);
else {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
@@ -674,12 +684,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
err = -EINVAL;
goto free_table;
}
- }
-
- if (sbi->options.utf8)
- set_default_d_op(sb, &exfat_utf8_dentry_ops);
- else
set_default_d_op(sb, &exfat_dentry_ops);
+ }
root_inode = new_inode(sb);
if (!root_inode) {
@@ -742,12 +748,44 @@ static void exfat_free(struct fs_context *fc)
static int exfat_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
+ struct exfat_sb_info *remount_sbi = fc->s_fs_info;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_mount_options *new_opts = &remount_sbi->options;
+ struct exfat_mount_options *cur_opts = &sbi->options;
+
fc->sb_flags |= SB_NODIRATIME;
sync_filesystem(sb);
- mutex_lock(&EXFAT_SB(sb)->s_lock);
+ mutex_lock(&sbi->s_lock);
exfat_clear_volume_dirty(sb);
- mutex_unlock(&EXFAT_SB(sb)->s_lock);
+ mutex_unlock(&sbi->s_lock);
+
+ if (new_opts->allow_utime == (unsigned short)-1)
+ new_opts->allow_utime = ~new_opts->fs_dmask & 0022;
+
+ /*
+ * Since the old settings of these mount options are cached in
+ * inodes or dentries, they cannot be modified dynamically.
+ */
+ if (strcmp(new_opts->iocharset, cur_opts->iocharset) ||
+ new_opts->keep_last_dots != cur_opts->keep_last_dots ||
+ new_opts->sys_tz != cur_opts->sys_tz ||
+ new_opts->time_offset != cur_opts->time_offset ||
+ !uid_eq(new_opts->fs_uid, cur_opts->fs_uid) ||
+ !gid_eq(new_opts->fs_gid, cur_opts->fs_gid) ||
+ new_opts->fs_fmask != cur_opts->fs_fmask ||
+ new_opts->fs_dmask != cur_opts->fs_dmask ||
+ new_opts->allow_utime != cur_opts->allow_utime)
+ return -EINVAL;
+
+ if (new_opts->discard != cur_opts->discard &&
+ new_opts->discard &&
+ !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "remounting with \"discard\" option, but the device does not support discard");
+ return -EINVAL;
+ }
+
+ swap(*cur_opts, *new_opts);
return 0;
}
@@ -777,8 +815,8 @@ static int exfat_init_fs_context(struct fs_context *fc)
sbi->options.fs_fmask = current->fs->umask;
sbi->options.fs_dmask = current->fs->umask;
sbi->options.allow_utime = -1;
- sbi->options.iocharset = exfat_default_iocharset;
sbi->options.errors = EXFAT_ERRORS_RO;
+ exfat_set_iocharset(&sbi->options, exfat_default_iocharset);
fc->s_fs_info = sbi;
fc->ops = &exfat_context_ops;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c9ca41d91a6c..01873c2a34ad 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,31 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Ext3 configs are here for backward compatibility with old configs which may
-# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
-# kernels after the removal of ext3 driver.
-config EXT3_FS
- tristate "The Extended 3 (ext3) filesystem"
- select EXT4_FS
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS
- select EXT4_FS_POSIX_ACL
- select FS_POSIX_ACL
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS
- select EXT4_FS_SECURITY
- help
- This config option is here only for backward compatibility. ext3
- filesystem is now handled by the ext4 driver.
-
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
select BUFFER_HEAD
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6cb784a56b3b..57087da6c7be 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1450,7 +1450,9 @@ struct ext4_super_block {
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
- __le32 s_reserved[94]; /* Padding to the end of the block */
+ __le16 s_def_resuid_hi;
+ __le16 s_def_resgid_hi;
+ __le32 s_reserved[93]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1820,6 +1822,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+static inline int ext4_get_resuid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resuid) |
+ le16_to_cpu(es->s_def_resuid_hi) << 16;
+}
+
+static inline int ext4_get_resgid(struct ext4_super_block *es)
+{
+ return le16_to_cpu(es->s_def_resgid) |
+ le16_to_cpu(es->s_def_resgid_hi) << 16;
+}
+
/*
* Returns: sbi->field[index]
* Used to access an array element from the following sbi fields which require
@@ -1990,6 +2004,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
+ * Check whether the inode is tracked as orphan (either in orphan file or
+ * orphan list).
+ */
+static inline bool ext4_inode_orphan_tracked(struct inode *inode)
+{
+ return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan);
+}
+
+/*
* Codes for operating systems
*/
#define EXT4_OS_LINUX 0
@@ -3142,6 +3166,8 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
+extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
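[Editor's note, not part of the patch] The new s_def_resuid_hi/s_def_resgid_hi
fields widen the reserved-blocks uid/gid from 16 to 32 bits while staying
compatible with old superblocks, where the _hi words are zero and the value is
unchanged. Worked example with hypothetical values: for s_def_resuid = 0x0005
and s_def_resuid_hi = 0x0001, ext4_get_resuid() returns
0x0005 | (0x0001 << 16) = 0x10005 = 65541.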
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 42bee1d4f9f9..fa66b08de999 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -663,7 +663,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
- blk_opf_t write_flags = REQ_SYNC;
+ blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
/* Add REQ_FUA | REQ_PREFLUSH only to its tail */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 93240e35ee36..7a8b30932189 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -354,7 +354,7 @@ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
* to cleanup the orphan list in ext4_handle_inode_extension(). Do it
* now.
*/
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) {
handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 91185c40f755..22fc333244ef 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
struct ext4_fsmap *rec)
{
- return rec->fmr_physical < info->gfi_low.fmr_physical;
+ return rec->fmr_physical + rec->fmr_length <=
+ info->gfi_low.fmr_physical;
}
/*
@@ -200,15 +201,18 @@ static int ext4_getfsmap_meta_helper(struct super_block *sb,
ext4_group_first_block_no(sb, agno));
fs_end = fs_start + EXT4_C2B(sbi, len);
- /* Return relevant extents from the meta_list */
+ /*
+ * Return relevant extents from the meta_list. We emit all extents that
+ * partially/fully overlap with the query range.
+ */
list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
- if (p->fmr_physical < info->gfi_next_fsblk) {
+ if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
list_del(&p->fmr_list);
kfree(p);
continue;
}
- if (p->fmr_physical <= fs_start ||
- p->fmr_physical + p->fmr_length <= fs_end) {
+ if (p->fmr_physical <= fs_end &&
+ p->fmr_physical + p->fmr_length > fs_start) {
/* Emit the retained free extent record if present */
if (info->gfi_lastfree.fmr_owner) {
error = ext4_getfsmap_helper(sb, info,
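[Editor's note, not part of the patch] Both fsmap fixes replace a comparison
of start offsets with a test against the end of the extent. An extent
[start, start + length) lies entirely below a key k iff start + length <= k,
and overlaps a query window iff it begins at or before the window's end and
ends past the window's start. A hypothetical helper expressing the second
test, mirroring the fixed condition above:

	static inline bool extent_overlaps(u64 start, u64 len,
					   u64 win_start, u64 win_end)
	{
		return start <= win_end && start + len > win_start;
	}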
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index d45124318200..da76353b3a57 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1025,7 +1025,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
/* Go read the buffer for the next level down */
- bh = ext4_sb_bread(inode->i_sb, nr, 0);
+ bh = ext4_sb_bread_nofail(inode->i_sb, nr);
/*
* A read failure? Report error and clear slot
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b7a15db4953..f9e4ac87211e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3872,47 +3872,12 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
-static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
-{
- /* must be a directio to fall back to buffered */
- if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
- (IOMAP_WRITE | IOMAP_DIRECT))
- return false;
-
- /* atomic writes are all-or-nothing */
- if (flags & IOMAP_ATOMIC)
- return false;
-
- /* can only try again if we wrote nothing */
- return written == 0;
-}
-
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
-{
- /*
- * Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code for
- * non-atomic write so that we fallback to buffered I/O and attempt to
- * complete the remainder of the I/O.
- * For non-atomic writes, any blocks that may have been
- * allocated in preparation for the direct I/O will be reused during
- * buffered I/O. For atomic write, we never fallback to buffered-io.
- */
- if (ext4_want_directio_fallback(flags, written))
- return -ENOTBLK;
-
- return 0;
-}
-
const struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
};
const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_begin = ext4_iomap_overwrite_begin,
- .iomap_end = ext4_iomap_end,
};
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
@@ -4287,7 +4252,11 @@ int ext4_can_truncate(struct inode *inode)
* We have to make sure i_disksize gets properly updated before we truncate
* page cache due to hole punching or zero range. Otherwise i_disksize update
* can get lost as it may have been postponed to submission of writeback but
- * that will never happen after we truncate page cache.
+ * that will never happen if we remove the folio containing i_size from the
+ * page cache. Also if we punch hole within i_size but above i_disksize,
+ * following ext4_page_mkwrite() may mistakenly allocate written blocks over
+ * the hole and thus introduce allocated blocks beyond i_disksize which is
+ * not allowed (e2fsck would complain in case of crash).
*/
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t len)
@@ -4298,9 +4267,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t size = i_size_read(inode);
WARN_ON(!inode_is_locked(inode));
- if (offset > size || offset + len < size)
+ if (offset > size)
return 0;
+ if (offset + len < size)
+ size = offset + len;
if (EXT4_I(inode)->i_disksize >= size)
return 0;
@@ -4748,7 +4719,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
* old inodes get re-used with the upper 16 bits of the
* uid/gid intact.
*/
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
} else {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 84e3c73952d7..a93a7baae990 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -27,14 +27,16 @@
#include "fsmap.h"
#include <trace/events/ext4.h>
-typedef void ext4_update_sb_callback(struct ext4_super_block *es,
- const void *arg);
+typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es,
+ const void *arg);
/*
* Superblock modification callback function for changing file system
* label
*/
-static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
+static void ext4_sb_setlabel(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
/* Sanity check, this should never happen */
BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX);
@@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
* Superblock modification callback function for changing file system
* UUID.
*/
-static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+static void ext4_sb_setuuid(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
}
@@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
goto out_err;
lock_buffer(bh);
- func(es, arg);
+ func(sbi, es, arg);
ext4_superblock_csum_set(sb);
unlock_buffer(bh);
@@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
unlock_buffer(bh);
goto out_bh;
}
- func(es, arg);
+ func(EXT4_SB(sb), es, arg);
if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
@@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp,
return ret;
}
+
+#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \
+ EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \
+ EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \
+ EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \
+ EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \
+ EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \
+ EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \
+ EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \
+ EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \
+ EXT4_TUNE_FL_ENCODING_FLAGS)
+
+#define EXT4_TUNE_SET_COMPAT_SUPP \
+ (EXT4_FEATURE_COMPAT_DIR_INDEX | \
+ EXT4_FEATURE_COMPAT_STABLE_INODES)
+#define EXT4_TUNE_SET_INCOMPAT_SUPP \
+ (EXT4_FEATURE_INCOMPAT_EXTENTS | \
+ EXT4_FEATURE_INCOMPAT_EA_INODE | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
+ EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)
+#define EXT4_TUNE_SET_RO_COMPAT_SUPP \
+ (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_PROJECT | \
+ EXT4_FEATURE_RO_COMPAT_VERITY)
+
+#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0)
+#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0)
+
+#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \
+ SB_ENC_NO_COMPAT_FALLBACK_FL)
+
+static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi,
+ struct ext4_tune_sb_params __user *params)
+{
+ struct ext4_tune_sb_params ret;
+ struct ext4_super_block *es = sbi->s_es;
+
+ memset(&ret, 0, sizeof(ret));
+ ret.set_flags = TUNE_OPS_SUPPORTED;
+ ret.errors_behavior = le16_to_cpu(es->s_errors);
+ ret.mnt_count = le16_to_cpu(es->s_mnt_count);
+ ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count);
+ ret.checkinterval = le32_to_cpu(es->s_checkinterval);
+ ret.last_check_time = le32_to_cpu(es->s_lastcheck);
+ ret.reserved_blocks = ext4_r_blocks_count(es);
+ ret.blocks_count = ext4_blocks_count(es);
+ ret.reserved_uid = ext4_get_resuid(es);
+ ret.reserved_gid = ext4_get_resgid(es);
+ ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts);
+ ret.def_hash_alg = es->s_def_hash_version;
+ ret.raid_stride = le16_to_cpu(es->s_raid_stride);
+ ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width);
+ ret.encoding = le16_to_cpu(es->s_encoding);
+ ret.encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ strscpy_pad(ret.mount_opts, es->s_mount_opts);
+ ret.feature_compat = le32_to_cpu(es->s_feature_compat);
+ ret.feature_incompat = le32_to_cpu(es->s_feature_incompat);
+ ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat);
+ ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP;
+ ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP;
+ ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP;
+ ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP;
+ ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP;
+ ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP;
+ if (copy_to_user(params, &ret, sizeof(ret)))
+ return -EFAULT;
+ return 0;
+}
+
+static void ext4_sb_setparams(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
+{
+ const struct ext4_tune_sb_params *params = arg;
+
+ if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR)
+ es->s_errors = cpu_to_le16(params->errors_behavior);
+ if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT)
+ es->s_mnt_count = cpu_to_le16(params->mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT)
+ es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count);
+ if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL)
+ es->s_checkinterval = cpu_to_le32(params->checkinterval);
+ if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME)
+ es->s_lastcheck = cpu_to_le32(params->last_check_time);
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) {
+ ext4_fsblk_t blk = params->reserved_blocks;
+
+ es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
+ es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) {
+ int uid = params->reserved_uid;
+
+ es->s_def_resuid = cpu_to_le16(uid & 0xFFFF);
+ es->s_def_resuid_hi = cpu_to_le16(uid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) {
+ int gid = params->reserved_gid;
+
+ es->s_def_resgid = cpu_to_le16(gid & 0xFFFF);
+ es->s_def_resgid_hi = cpu_to_le16(gid >> 16);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS)
+ es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts);
+ if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ es->s_def_hash_version = params->def_hash_alg;
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE)
+ es->s_raid_stride = cpu_to_le16(params->raid_stride);
+ if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH)
+ es->s_raid_stripe_width =
+ cpu_to_le32(params->raid_stripe_width);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING)
+ es->s_encoding = cpu_to_le16(params->encoding);
+ if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)
+ es->s_encoding_flags = cpu_to_le16(params->encoding_flags);
+	if (params->set_flags & EXT4_TUNE_FL_MOUNT_OPTS)
+		strscpy_pad(es->s_mount_opts, params->mount_opts);
+ if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ es->s_feature_compat |=
+ cpu_to_le32(params->set_feature_compat_mask);
+ es->s_feature_incompat |=
+ cpu_to_le32(params->set_feature_incompat_mask);
+ es->s_feature_ro_compat |=
+ cpu_to_le32(params->set_feature_ro_compat_mask);
+ es->s_feature_compat &=
+ ~cpu_to_le32(params->clear_feature_compat_mask);
+ es->s_feature_incompat &=
+ ~cpu_to_le32(params->clear_feature_incompat_mask);
+ es->s_feature_ro_compat &=
+ ~cpu_to_le32(params->clear_feature_ro_compat_mask);
+ if (params->set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)
+ es->s_def_hash_version = sbi->s_def_hash_version;
+ if (params->set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed);
+ }
+ if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK)
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+}
+
+static int ext4_ioctl_set_tune_sb(struct file *filp,
+ struct ext4_tune_sb_params __user *in)
+{
+ struct ext4_tune_sb_params params;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int enabling_casefold = 0;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&params, in, sizeof(params)))
+ return -EFAULT;
+
+ if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0)
+ return -EOPNOTSUPP;
+
+ if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) &&
+ (params.errors_behavior > EXT4_ERRORS_PANIC))
+ return -EINVAL;
+
+ if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) &&
+ (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) &&
+ ((params.def_hash_alg > DX_HASH_LAST) ||
+ (params.def_hash_alg == DX_HASH_SIPHASH)))
+ return -EINVAL;
+ if ((params.set_flags & EXT4_TUNE_FL_FEATURES) &&
+ (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES))
+ return -EINVAL;
+
+ if (params.set_flags & EXT4_TUNE_FL_FEATURES) {
+ params.set_feature_compat_mask =
+ params.feature_compat &
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask =
+ params.feature_incompat &
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask =
+ params.feature_ro_compat &
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ params.clear_feature_compat_mask =
+ ~params.feature_compat &
+ le32_to_cpu(es->s_feature_compat);
+ params.clear_feature_incompat_mask =
+ ~params.feature_incompat &
+ le32_to_cpu(es->s_feature_incompat);
+ params.clear_feature_ro_compat_mask =
+ ~params.feature_ro_compat &
+ le32_to_cpu(es->s_feature_ro_compat);
+ params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) {
+ if ((params.set_feature_compat_mask &
+ ~EXT4_TUNE_SET_COMPAT_SUPP) ||
+ (params.set_feature_incompat_mask &
+ ~EXT4_TUNE_SET_INCOMPAT_SUPP) ||
+ (params.set_feature_ro_compat_mask &
+ ~EXT4_TUNE_SET_RO_COMPAT_SUPP) ||
+ (params.clear_feature_compat_mask &
+ ~EXT4_TUNE_CLEAR_COMPAT_SUPP) ||
+ (params.clear_feature_incompat_mask &
+ ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) ||
+ (params.clear_feature_ro_compat_mask &
+ ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP))
+ return -EOPNOTSUPP;
+
+ /*
+ * Filter out the features that are already set from
+ * the set_mask.
+ */
+ params.set_feature_compat_mask &=
+ ~le32_to_cpu(es->s_feature_compat);
+ params.set_feature_incompat_mask &=
+ ~le32_to_cpu(es->s_feature_incompat);
+ params.set_feature_ro_compat_mask &=
+ ~le32_to_cpu(es->s_feature_ro_compat);
+ if ((params.set_feature_incompat_mask &
+ EXT4_FEATURE_INCOMPAT_CASEFOLD)) {
+ enabling_casefold = 1;
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) {
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING;
+ }
+ if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) {
+ params.encoding_flags = 0;
+ params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS;
+ }
+ }
+ if ((params.set_feature_compat_mask &
+ EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ uuid_t uu;
+
+ memcpy(&uu, sbi->s_hash_seed, UUID_SIZE);
+ if (uuid_is_null(&uu))
+ generate_random_uuid((char *)
+ &sbi->s_hash_seed);
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+ else if (sbi->s_def_hash_version == 0)
+ sbi->s_def_hash_version = DX_HASH_HALF_MD4;
+ if (!(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) &&
+ !(es->s_flags &
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) {
+#ifdef __CHAR_UNSIGNED__
+ sbi->s_hash_unsigned = 3;
+#else
+ sbi->s_hash_unsigned = 0;
+#endif
+ }
+ }
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding == 0)
+ params.encoding = EXT4_ENC_UTF8_12_1;
+ else if (params.encoding != EXT4_ENC_UTF8_12_1)
+ return -EINVAL;
+ }
+ if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) {
+ if (!enabling_casefold)
+ return -EINVAL;
+ if (params.encoding_flags & ~SB_ENC_SUPP_MASK)
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params);
+ mnt_drop_write_file(filp);
+
+ if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG)
+ sbi->s_def_hash_version = params.def_hash_alg;
+
+ return ret;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
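
When userspace passes absolute feature words via EXT4_TUNE_FL_FEATURES, the code above folds them into the EXT4_TUNE_FL_EDIT_FEATURES set/clear masks before validation. The conversion is plain mask arithmetic; a standalone sketch (names hypothetical):

    #include <stdint.h>

    struct feature_edit {
            uint32_t set;   /* bits wanted but not yet on disk */
            uint32_t clear; /* bits on disk but no longer wanted */
    };

    /* Turn an absolute feature word into edit masks relative to the
     * current on-disk state, as the FEATURES branch above does. */
    static struct feature_edit features_to_edit(uint32_t want, uint32_t have)
    {
            return (struct feature_edit){
                    .set   = want & ~have,
                    .clear = ~want & have,
            };
    }

With this identity the set and clear masks are disjoint by construction, which is why the later "filter out the features that are already set" step only has to mask against the on-disk words.
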
@@ -1616,6 +1908,11 @@ resizefs_out:
return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
case EXT4_IOC_SETFSUUID:
return ext4_ioctl_setuuid(filp, (const void __user *)arg);
+ case EXT4_IOC_GET_TUNE_SB_PARAM:
+ return ext4_ioctl_get_tune_sb(EXT4_SB(sb),
+ (void __user *)arg);
+ case EXT4_IOC_SET_TUNE_SB_PARAM:
+ return ext4_ioctl_set_tune_sb(filp, (void __user *)arg);
default:
return -ENOTTY;
}
@@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
#endif
-static void set_overhead(struct ext4_super_block *es, const void *arg)
+static void set_overhead(struct ext4_sb_info *sbi,
+ struct ext4_super_block *es, const void *arg)
{
es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8b18802e83eb..9087183602e4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3655,16 +3655,26 @@ static void ext4_discard_work(struct work_struct *work)
static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
{
+ if (!sbi->s_mb_avg_fragment_size)
+ return;
+
for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+
kfree(sbi->s_mb_avg_fragment_size);
+ sbi->s_mb_avg_fragment_size = NULL;
}
static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
{
+ if (!sbi->s_mb_largest_free_orders)
+ return;
+
for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+
kfree(sbi->s_mb_largest_free_orders);
+ sbi->s_mb_largest_free_orders = NULL;
}
int ext4_mb_init(struct super_block *sb)
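
Both destroy helpers above now follow the idempotent-teardown pattern: return early when the array was never allocated, and reset the pointer after freeing so a repeated call is a no-op. A generic sketch of the shape (illustrative names):

    #include <stdlib.h>

    struct cache { int *slots; };

    /* Safe to call on a partially initialized cache, and safe to call
     * twice: the NULL check and the reset make it idempotent. */
    static void cache_destroy(struct cache *c)
    {
            if (!c->slots)
                    return;
            free(c->slots);
            c->slots = NULL;
    }

This pattern typically matters on error paths where initialization fails partway and the same destroy routines can run more than once during unwind.
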
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 51661570cf3b..ab1ff51302fb 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -231,9 +231,9 @@ static int kmmpd(void *data)
* Adjust the mmp_check_interval depending on how much time
* it took for the MMP block to be written.
*/
- mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
- EXT4_MMP_MAX_CHECK_INTERVAL),
- EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MIN_CHECK_INTERVAL,
+ EXT4_MMP_MAX_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
}
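
The clamp() rewrite above is purely cosmetic: clamp(val, lo, hi) is exactly max(min(val, hi), lo), but it names the bounds and puts them in a readable order. Spelled out:

    /* clamp(val, lo, hi) == max(min(val, hi), lo) */
    static unsigned long clamp_ul(unsigned long val,
                                  unsigned long lo, unsigned long hi)
    {
            if (val < lo)
                    return lo;
            if (val > hi)
                    return hi;
            return val;
    }
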
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index adae3caf175a..4b091c21908f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -225,7 +225,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
do {
if (bh_offset(bh) + blocksize <= from)
continue;
- if (bh_offset(bh) > to)
+ if (bh_offset(bh) >= to)
break;
wait_on_buffer(bh);
if (buffer_uptodate(bh))
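
The one-character fix above is an off-by-one on a half-open range: `to` is an exclusive end offset, so a buffer head starting exactly at `to` lies outside [from, to) and the walk must stop on >=, not >. The interval test in isolation:

    #include <stdbool.h>
    #include <stddef.h>

    /* Does the block [start, start + size) overlap the half-open byte
     * range [from, to)? Both conditions mirror the loop above. */
    static bool block_overlaps(size_t start, size_t size,
                               size_t from, size_t to)
    {
            if (start + size <= from)       /* ends at or before the range */
                    return false;
            if (start >= to)                /* starts at or past the end */
                    return false;
            return true;
    }
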
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 524d4658fa40..33c3a89396b1 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -109,11 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
- /*
- * Inode orphaned in orphan file or in orphan list?
- */
- if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
- !list_empty(&EXT4_I(inode)->i_orphan))
+ if (ext4_inode_orphan_tracked(inode))
return 0;
/*
@@ -587,9 +583,20 @@ int ext4_init_orphan_info(struct super_block *sb)
ext4_msg(sb, KERN_ERR, "get orphan inode failed");
return PTR_ERR(inode);
}
+ /*
+ * This is just an artificial limit to prevent corrupted fs from
+ * consuming absurd amounts of memory when pinning blocks of orphan
+ * file in memory.
+ */
+ if (inode->i_size > 8 << 20) {
+ ext4_msg(sb, KERN_ERR, "orphan file too big: %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto out_put;
+ }
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc_array(oi->of_blocks,
+ oi->of_binfo = kvmalloc_array(oi->of_blocks,
sizeof(struct ext4_orphan_block),
GFP_KERNEL);
if (!oi->of_binfo) {
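
The new size cap bounds how much memory a corrupted orphan file can pin: of_blocks scales linearly with i_size, and each block gets an ext4_orphan_block descriptor. A back-of-the-envelope check of the bound (block size assumed 4 KiB for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t max_size = 8ULL << 20;         /* 8 MiB cap from the patch */
            unsigned int blocksize_bits = 12;       /* 4 KiB blocks (assumed) */
            uint64_t blocks = max_size >> blocksize_bits;

            /* at most 2048 tracked orphan blocks with these numbers */
            printf("of_blocks <= %llu\n", (unsigned long long)blocks);
            return 0;
    }

The switch from kmalloc_array() to kvmalloc_array() is complementary: even under the cap, the descriptor array can grow past what a contiguous slab allocation comfortably provides, and a vmalloc fallback is acceptable here.
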
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f2d4014d128..33e7c08c9529 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -265,6 +265,15 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
+struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
+ sector_t block)
+{
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL;
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
+}
+
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
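
ext4_sb_bread_nofail() composes its gfp mask in three steps: constrain to what the block device mapping allows, strip __GFP_FS so the read cannot recurse into filesystem reclaim, then add __GFP_NOFAIL for callers that cannot handle allocation failure. The shape of the composition, with stand-in flag values:

    /* Illustrative flag bits only -- not the kernel's definitions. */
    #define GFP_FS          0x1u
    #define GFP_MOVABLE     0x2u
    #define GFP_NOFAIL      0x4u

    static unsigned int nofail_gfp(unsigned int mapping_mask)
    {
            return (mapping_mask & ~GFP_FS) | GFP_MOVABLE | GFP_NOFAIL;
    }
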
@@ -1438,9 +1447,9 @@ static void ext4_free_in_core_inode(struct inode *inode)
static void ext4_destroy_inode(struct inode *inode)
{
- if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
+ if (ext4_inode_orphan_tracked(inode)) {
ext4_msg(inode->i_sb, KERN_ERR,
- "Inode %lu (%p): orphan list check failed!",
+ "Inode %lu (%p): inode tracked as orphan!",
inode->i_ino, EXT4_I(inode));
print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
EXT4_I(inode), sizeof(struct ext4_inode_info),
@@ -2466,7 +2475,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
struct ext4_fs_context *m_ctx)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *s_mount_opts = NULL;
+ char s_mount_opts[65];
struct ext4_fs_context *s_ctx = NULL;
struct fs_context *fc = NULL;
int ret = -ENOMEM;
@@ -2474,15 +2483,11 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
if (!sbi->s_es->s_mount_opts[0])
return 0;
- s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
- sizeof(sbi->s_es->s_mount_opts),
- GFP_KERNEL);
- if (!s_mount_opts)
- return ret;
+ strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts);
fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
if (!fc)
- goto out_free;
+ return -ENOMEM;
s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
if (!s_ctx)
@@ -2514,11 +2519,8 @@ parse_failed:
ret = 0;
out_free:
- if (fc) {
- ext4_fc_free(fc);
- kfree(fc);
- }
- kfree(s_mount_opts);
+ ext4_fc_free(fc);
+ kfree(fc);
return ret;
}
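
The rewrite above replaces a fallible kstrndup() with a 65-byte stack buffer: s_mount_opts is a fixed 64-byte on-disk field that is not guaranteed to be NUL-terminated, so one extra byte plus strscpy_pad() yields a safely terminated string with no allocation to unwind. A minimal equivalent, assuming those sizes:

    #include <string.h>

    /* Copy a fixed-width, possibly unterminated 64-byte field into a
     * 65-byte buffer that is always NUL-terminated. */
    static void copy_mount_opts(char dst[65], const char src[64])
    {
            memcpy(dst, src, 64);
            dst[64] = '\0';
    }

Dropping the allocation also lets the error unwind shrink, which is why the out_free label no longer needs NULL checks.
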
@@ -2964,11 +2966,11 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
}
if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
- le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ ext4_get_resuid(es) != EXT4_DEF_RESUID)
SEQ_OPTS_PRINT("resuid=%u",
from_kuid_munged(&init_user_ns, sbi->s_resuid));
if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
- le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ ext4_get_resgid(es) != EXT4_DEF_RESGID)
SEQ_OPTS_PRINT("resgid=%u",
from_kgid_munged(&init_user_ns, sbi->s_resgid));
def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
@@ -5283,8 +5285,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_set_def_opts(sb, es);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es));
+	sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es));
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5a6fe1513fd2..ce7253b3f549 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -251,6 +251,10 @@ check_xattrs(struct inode *inode, struct buffer_head *bh,
err_str = "invalid ea_ino";
goto errout;
}
+ if (ea_ino && !size) {
+ err_str = "invalid size in ea xattr";
+ goto errout;
+ }
if (size > EXT4_XATTR_SIZE_MAX) {
err_str = "e_value size too large";
goto errout;
@@ -1019,7 +1023,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
struct ext4_iloc iloc;
- s64 ref_count;
+ u64 ref_count;
int ret;
inode_lock_nested(ea_inode, I_MUTEX_XATTR);
@@ -1029,13 +1033,17 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
goto out;
ref_count = ext4_xattr_inode_get_ref(ea_inode);
+ if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) {
+ ext4_error_inode(ea_inode, __func__, __LINE__, 0,
+ "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d",
+ ea_inode->i_ino, ref_count, ref_change);
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
ref_count += ref_change;
ext4_xattr_inode_set_ref(ea_inode, ref_count);
if (ref_change > 0) {
- WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 1) {
WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
ea_inode->i_ino, ea_inode->i_nlink);
@@ -1044,9 +1052,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
ext4_orphan_del(handle, ea_inode);
}
} else {
- WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
- ea_inode->i_ino, ref_count);
-
if (ref_count == 0) {
WARN_ONCE(ea_inode->i_nlink != 1,
"EA inode %lu i_nlink=%u",
@@ -1530,7 +1535,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
!(current->flags & PF_MEMALLOC_NOFS));
- ea_data = kvmalloc(value_len, GFP_KERNEL);
+ ea_data = kvmalloc(value_len, GFP_NOFS);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index db3831f7f2f5..bbe07e3a6c75 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
return get_sectors_written(sbi->sb->s_bdev);
}
+static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type)
+{
+ cpc->stats.times[type] = ktime_get();
+}
+
+static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+ unsigned long long sb_diff, cur_diff;
+ enum cp_time ct;
+
+ sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END],
+ sbi->cp_stats.times[CP_TIME_START]);
+ cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END],
+ cpc->stats.times[CP_TIME_START]);
+
+ if (cur_diff > sb_diff) {
+ sbi->cp_stats = cpc->stats;
+ if (cur_diff < CP_LONG_LATENCY_THRESHOLD)
+ return;
+
+ f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff);
+ for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++)
+ f2fs_warn(sbi, "Step#%d: %llu ms", ct,
+ (u64)ktime_ms_delta(cpc->stats.times[ct + 1],
+ cpc->stats.times[ct]));
+ }
+}
+
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
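
The checkpoint instrumentation above is a stopwatch array: stamp a monotonic time after each phase, and when the end-to-end latency sets a new record past the threshold, report the per-phase deltas. A userspace sketch of the same pattern (phase names abbreviated, 5000 ms threshold as in the patch):

    #include <stdio.h>
    #include <time.h>

    enum { STEP_START, STEP_LOCK, STEP_FLUSH, STEP_END, STEP_MAX };

    static long long now_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
    }

    /* Call with t[i] = now_ms() recorded after each step. */
    static void report_slow_run(const long long t[STEP_MAX])
    {
            if (t[STEP_END] - t[STEP_START] < 5000)
                    return;
            for (int i = STEP_START; i < STEP_MAX - 1; i++)
                    printf("Step#%d: %lld ms\n", i, t[i + 1] - t[i]);
    }
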
@@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ stat_cp_time(cpc, CP_TIME_SYNC_META);
+
/* start to update checkpoint, cp ver is already updated previously */
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
@@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ stat_cp_time(cpc, CP_TIME_SYNC_CP_META);
+
/* Wait for all dirty meta pages to be submitted for IO */
f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
+ stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META);
/* wait for previous submitted meta pages writeback */
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
+ stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
if (err)
return err;
+ stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE);
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
+ stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP);
/*
* invalidate intermediate page cache borrowed from meta inode which are
@@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long long ckpt_ver;
int err = 0;
+ stat_cp_time(cpc, CP_TIME_START);
+
if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
return -EROFS;
@@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason != CP_RESIZE)
f2fs_down_write(&sbi->cp_global_sem);
+ stat_cp_time(cpc, CP_TIME_LOCK);
+
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
@@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (err)
goto out;
+ stat_cp_time(cpc, CP_TIME_OP_LOCK);
+
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
f2fs_flush_merged_writes(sbi);
@@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
+ stat_cp_time(cpc, CP_TIME_FLUSH_META);
+
/* save inmem log status */
f2fs_save_inmem_curseg(sbi);
@@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
stat_inc_cp_count(sbi);
stop:
unblock_operations(sbi);
+ stat_cp_time(cpc, CP_TIME_END);
+ check_cp_time(sbi, cpc);
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
@@ -1778,6 +1824,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
req->ret = ret;
+ req->delta_time = diff;
complete(&req->wait);
sum_diff += diff;
@@ -1873,6 +1920,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
else
flush_remained_ckpt_reqs(sbi, &req);
+ if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) {
+ f2fs_warn_ratelimited(sbi,
+ "blocked on checkpoint for %u ms", cprc->peak_time);
+ dump_stack();
+ }
+
return req.ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 5c1f47e45dab..6ad8d3bc6df7 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock)
{
void *fsdata = NULL;
struct page *pagep;
+ struct page **rpages;
int log_cluster_size = F2FS_I(inode)->i_log_cluster_size;
pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) <<
log_cluster_size;
+ int i;
int err;
err = f2fs_is_compressed_cluster(inode, start_idx);
@@ -1238,27 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock)
if (err <= 0)
return err;
- if (err > 0) {
- struct page **rpages = fsdata;
- int cluster_size = F2FS_I(inode)->i_cluster_size;
- int i;
-
- for (i = cluster_size - 1; i >= 0; i--) {
- struct folio *folio = page_folio(rpages[i]);
- loff_t start = folio->index << PAGE_SHIFT;
-
- if (from <= start) {
- folio_zero_segment(folio, 0, folio_size(folio));
- } else {
- folio_zero_segment(folio, from - start,
- folio_size(folio));
- break;
- }
- }
+ rpages = fsdata;
+
+ for (i = (1 << log_cluster_size) - 1; i >= 0; i--) {
+ struct folio *folio = page_folio(rpages[i]);
+ loff_t start = (loff_t)folio->index << PAGE_SHIFT;
+ loff_t offset = from > start ? from - start : 0;
- f2fs_compress_write_end(inode, fsdata, start_idx, true);
+ folio_zero_segment(folio, offset, folio_size(folio));
+
+ if (from >= start)
+ break;
}
- return 0;
+
+ f2fs_compress_write_end(inode, fsdata, start_idx, true);
+
+ err = filemap_write_and_wait_range(inode->i_mapping,
+ round_down(from, 1 << log_cluster_size << PAGE_SHIFT),
+ LLONG_MAX);
+ if (err)
+ return err;
+
+ truncate_pagecache(inode, from);
+
+ return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock);
}
static int f2fs_write_compressed_pages(struct compress_ctx *cc,
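
The truncation path above leans on one piece of index arithmetic: rounding a byte offset down to the first page index of its compression cluster, where a cluster spans 1 << log_cluster_size pages. Spelled out (4 KiB pages assumed for illustration):

    #include <stdint.h>

    #define PAGE_SHIFT 12   /* 4 KiB pages, assumed */

    /* Page index of the first page in the cluster containing byte
     * offset 'from' -- the start_idx computation above. */
    static uint64_t cluster_start_index(uint64_t from, int log_cluster_size)
    {
            return (from >> (PAGE_SHIFT + log_cluster_size))
                            << log_cluster_size;
    }

For example, with log_cluster_size = 2 (4-page clusters), from = 0x6000 lands on page index 6, and clearing the low two bits yields 4, the first page of that cluster.
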
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7961e0ddfca3..ef38e62cda8f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
static bool io_type_is_mergeable(struct f2fs_bio_info *io,
struct f2fs_io_info *fio)
{
+ blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA);
+
if (io->fio.op != fio->op)
return false;
- return io->fio.op_flags == fio->op_flags;
+ return (io->fio.op_flags & mask) == (fio->op_flags & mask);
}
static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
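
The mergeability change above masks out REQ_PREFLUSH and REQ_FUA before comparing op flags, treating flush/FUA bits as irrelevant to whether two I/Os belong to the same stream. Reduced to its essence, with stand-in flag values:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative bit values, not the block layer's. */
    #define REQ_PREFLUSH    0x1u
    #define REQ_FUA         0x2u

    static bool op_flags_mergeable(uint32_t a, uint32_t b)
    {
            uint32_t mask = ~(REQ_PREFLUSH | REQ_FUA);

            return (a & mask) == (b & mask);
    }
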
@@ -911,7 +913,7 @@ alloc_new:
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio));
- inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -1083,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
}
/* This can handle encryption stuffs */
-static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
+static void f2fs_submit_page_read(struct inode *inode, struct folio *folio,
block_t blkaddr, blk_opf_t op_flags,
bool for_write)
{
@@ -1092,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
folio->index, for_write);
- if (IS_ERR(bio))
- return PTR_ERR(bio);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, blkaddr);
- if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) {
- iostat_update_and_unbind_ctx(bio);
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
- bio_put(bio);
- return -EFAULT;
- }
+ if (!bio_add_folio(bio, folio, PAGE_SIZE, 0))
+ f2fs_bug_on(sbi, 1);
+
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
f2fs_submit_read_bio(sbi, bio, DATA);
- return 0;
}
static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
@@ -1265,10 +1260,8 @@ got_it:
return folio;
}
- err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
+ f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
op_flags, for_write);
- if (err)
- goto put_err;
return folio;
put_err:
@@ -1572,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
+ if (flag == F2FS_GET_BLOCK_PRECACHE)
+ mode = LOOKUP_NODE_RA;
+
next_dnode:
if (map->m_may_create) {
if (f2fs_lfs_mode(sbi))
@@ -1778,12 +1774,13 @@ sync_out:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_read_extent_cache_range(&dn,
- start_pgofs, map->m_pblk + ofs,
- map->m_len - ofs);
+ if (map->m_len > ofs)
+ f2fs_update_read_extent_cache_range(&dn,
+ start_pgofs, map->m_pblk + ofs,
+ map->m_len - ofs);
}
if (map->m_next_extent)
- *map->m_next_extent = pgofs + 1;
+ *map->m_next_extent = is_hole ? pgofs + 1 : pgofs;
}
f2fs_put_dnode(&dn);
unlock_out:
@@ -2145,16 +2142,10 @@ submit_and_realloc:
f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
- if (bio == NULL) {
+ if (bio == NULL)
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
f2fs_ra_op_flags(rac), index,
false);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- bio = NULL;
- goto out;
- }
- }
/*
* If the page is under writeback, we need to wait for
@@ -2303,18 +2294,10 @@ submit_and_realloc:
bio = NULL;
}
- if (!bio) {
+ if (!bio)
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i,
f2fs_ra_op_flags(rac),
folio->index, for_write);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret, true);
- f2fs_put_dnode(&dn);
- *bio_ret = NULL;
- return ret;
- }
- }
if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
@@ -3639,11 +3622,9 @@ repeat:
err = -EFSCORRUPTED;
goto put_folio;
}
- err = f2fs_submit_page_read(use_cow ?
+ f2fs_submit_page_read(use_cow ?
F2FS_I(inode)->cow_inode : inode,
folio, blkaddr, 0, true);
- if (err)
- goto put_folio;
folio_lock(folio);
if (unlikely(folio->mapping != mapping)) {
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index fffd7749d6d1..48f4f98afb01 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -16,6 +16,21 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
+static inline bool f2fs_should_fallback_to_linear(struct inode *dir)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ switch (F2FS_OPTION(sbi).lookup_mode) {
+ case LOOKUP_PERF:
+ return false;
+ case LOOKUP_COMPAT:
+ return true;
+ case LOOKUP_AUTO:
+ return !sb_no_casefold_compat_fallback(sbi->sb);
+ }
+ return false;
+}
+
#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -366,7 +381,7 @@ start_find_entry:
out:
#if IS_ENABLED(CONFIG_UNICODE)
- if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ if (f2fs_should_fallback_to_linear(dir) &&
IS_CASEFOLDED(dir) && !de && use_hash) {
use_hash = false;
goto start_find_entry;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 199c1e7a83ef..33e09c453c70 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
p = &(*p)->rb_right;
leftmost = false;
} else {
+ f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, "
+ "extent node in rb tree [%u, %u, %u], age [%llu, %llu], "
+ "extent node to insert [%u, %u, %u], age [%llu, %llu]",
+ __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age,
+ en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks);
f2fs_bug_on(sbi, 1);
+ return NULL;
}
}
@@ -664,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode,
if (!et)
return;
+ if (unlikely(len == 0)) {
+ f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, "
+ "extent [%u, %u, %u], age [%llu, %llu]",
+ __func__, type, tei->fofs, tei->blk, tei->len,
+ tei->age, tei->last_blocks);
+ f2fs_bug_on(sbi, 1);
+ return;
+ }
+
if (type == EX_READ)
trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
tei->blk, 0);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6e465bbc85ee..5b4e9548a231 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
* string rather than using the MS_LAZYTIME flag, so this must remain.
*/
#define F2FS_MOUNT_LAZYTIME 0x40000000
+#define F2FS_MOUNT_RESERVE_NODE 0x80000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -155,6 +156,18 @@ enum blkzone_allocation_policy {
BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */
};
+enum bggc_io_aware_policy {
+ AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */
+ AWARE_READ_IO, /* skip background GC if there is pending read IO */
+	AWARE_NONE, /* don't consider pending IO for background GC */
+};
+
+enum device_allocation_policy {
+ ALLOCATE_FORWARD_NOHINT,
+ ALLOCATE_FORWARD_WITHIN_HINT,
+ ALLOCATE_FORWARD_FROM_HINT,
+};
+
/*
* An implementation of an rwsem that is explicitly unfair to readers. This
* prevents priority inversion when a low-priority reader acquires the read lock
@@ -172,6 +185,7 @@ struct f2fs_rwsem {
struct f2fs_mount_info {
unsigned int opt;
block_t root_reserved_blocks; /* root reserved blocks */
+ block_t root_reserved_nodes; /* root reserved nodes */
kuid_t s_resuid; /* reserved blocks for uid */
kgid_t s_resgid; /* reserved blocks for gid */
int active_logs; /* # of active logs */
@@ -212,6 +226,7 @@ struct f2fs_mount_info {
int compress_mode; /* compression mode */
unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
+ unsigned int lookup_mode;
};
#define F2FS_FEATURE_ENCRYPT 0x00000001
@@ -266,14 +281,36 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
+#define DEF_ENABLE_INTERVAL 16 /* 16 secs */
#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
+enum cp_time {
+ CP_TIME_START, /* begin */
+ CP_TIME_LOCK, /* after cp_global_sem */
+ CP_TIME_OP_LOCK, /* after block_operation */
+ CP_TIME_FLUSH_META, /* after flush sit/nat */
+ CP_TIME_SYNC_META, /* after sync_meta_pages */
+ CP_TIME_SYNC_CP_META, /* after sync cp meta pages */
+ CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */
+ CP_TIME_WAIT_CP_DATA, /* after wait on cp data */
+ CP_TIME_FLUSH_DEVICE, /* after flush device cache */
+ CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */
+ CP_TIME_END, /* after unblock_operation */
+ CP_TIME_MAX,
+};
+
+/* time cost stats of checkpoint */
+struct cp_stats {
+ ktime_t times[CP_TIME_MAX];
+};
+
struct cp_control {
int reason;
__u64 trim_start;
__u64 trim_end;
__u64 trim_minlen;
+ struct cp_stats stats;
};
/*
@@ -334,7 +371,10 @@ struct ckpt_req {
struct completion wait; /* completion for checkpoint done */
struct llist_node llnode; /* llist_node to be linked in wait queue */
int ret; /* return code of checkpoint */
- ktime_t queue_time; /* request queued time */
+ union {
+ ktime_t queue_time; /* request queued time */
+ ktime_t delta_time; /* time in queue */
+ };
};
struct ckpt_req_control {
@@ -350,6 +390,9 @@ struct ckpt_req_control {
unsigned int peak_time; /* peak wait time in msec until now */
};
+/* threshold for reporting a long-blocked checkpoint, unit: ms */
+#define CP_LONG_LATENCY_THRESHOLD 5000
+
/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
@@ -1375,6 +1418,7 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
+ ENABLE_TIME,
UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1454,6 +1498,12 @@ enum {
TOTAL_CALL = FOREGROUND,
};
+enum f2fs_lookup_mode {
+ LOOKUP_PERF,
+ LOOKUP_COMPAT,
+ LOOKUP_AUTO,
+};
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1643,6 +1693,7 @@ struct f2fs_sb_info {
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
struct ckpt_req_control cprc_info; /* for checkpoint request control */
+ struct cp_stats cp_stats; /* for time stat of checkpoint */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -1810,6 +1861,9 @@ struct f2fs_sb_info {
spinlock_t dev_lock; /* protect dirty_device */
bool aligned_blksize; /* all devices has the same logical blksize */
unsigned int first_seq_zone_segno; /* first segno in sequential zone */
+	unsigned int bggc_io_aware; /* IO awareness policy for background GC */
+ unsigned int allocate_section_hint; /* the boundary position between devices */
+ unsigned int allocate_section_policy; /* determine the section writing priority */
/* For write statistics */
u64 sectors_written_start;
@@ -2362,13 +2416,11 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
-static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
+static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi,
struct inode *inode, bool cap)
{
if (!inode)
return true;
- if (!test_opt(sbi, RESERVE_ROOT))
- return false;
if (IS_NOQUOTA(inode))
return true;
if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid()))
@@ -2389,7 +2441,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi,
avail_user_block_count = sbi->user_block_count -
sbi->current_reserved_blocks;
- if (!__allow_reserved_blocks(sbi, inode, cap))
+ if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
@@ -2747,7 +2799,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
block_t valid_block_count;
- unsigned int valid_node_count;
+ unsigned int valid_node_count, avail_user_node_count;
unsigned int avail_user_block_count;
int err;
@@ -2769,15 +2821,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
valid_block_count = sbi->total_valid_block_count + 1;
- avail_user_block_count = get_available_block_count(sbi, inode, false);
+ avail_user_block_count = get_available_block_count(sbi, inode,
+ test_opt(sbi, RESERVE_NODE));
if (unlikely(valid_block_count > avail_user_block_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
+ avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+ if (test_opt(sbi, RESERVE_NODE) &&
+ !__allow_reserved_root(sbi, inode, true))
+ avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes;
valid_node_count = sbi->total_valid_node_count + 1;
- if (unlikely(valid_node_count > sbi->total_node_count)) {
+ if (unlikely(valid_node_count > avail_user_node_count)) {
spin_unlock(&sbi->stat_lock);
goto enospc;
}
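
The reserve_node accounting above mirrors reserve_root on the node axis: the usable node count is the total minus the filesystem's own reserved nids, further reduced by root_reserved_nodes for callers not entitled to the root pool. As a pure function (names hypothetical):

    #include <stdbool.h>

    static bool can_alloc_node(unsigned int valid_nodes,
                               unsigned int total_nodes,
                               unsigned int fs_reserved,
                               unsigned int root_reserved,
                               bool reserve_node_opt, bool may_use_root_pool)
    {
            unsigned int avail = total_nodes - fs_reserved;

            if (reserve_node_opt && !may_use_root_pool)
                    avail -= root_reserved;
            return valid_nodes + 1 <= avail;
    }
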
@@ -3004,13 +3061,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (sbi->gc_mode == GC_URGENT_HIGH)
return true;
- if (zoned_gc) {
- if (is_inflight_read_io(sbi))
- return false;
- } else {
- if (is_inflight_io(sbi, type))
- return false;
- }
+ if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi))
+ return false;
+ if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type))
+ return false;
if (sbi->gc_mode == GC_URGENT_MID)
return true;
@@ -3770,6 +3824,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname);
* node.c
*/
struct node_info;
+enum node_type;
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
@@ -3792,7 +3847,8 @@ int f2fs_remove_inode_page(struct inode *inode);
struct folio *f2fs_new_inode_folio(struct inode *inode);
struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
-struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ enum node_type node_type);
struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid);
int f2fs_move_node_folio(struct folio *node_folio, int gc_type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 42faaed6a02d..ffa045b39c01 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -35,15 +35,23 @@
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
-static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size)
+static void f2fs_zero_post_eof_page(struct inode *inode,
+ loff_t new_size, bool lock)
{
loff_t old_size = i_size_read(inode);
if (old_size >= new_size)
return;
+ if (mapping_empty(inode->i_mapping))
+ return;
+
+ if (lock)
+ filemap_invalidate_lock(inode->i_mapping);
/* zero or drop pages only in range of [old_size, new_size] */
- truncate_pagecache(inode, old_size);
+ truncate_inode_pages_range(inode->i_mapping, old_size, new_size);
+ if (lock)
+ filemap_invalidate_unlock(inode->i_mapping);
}
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
@@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true);
file_update_time(vmf->vma->vm_file);
filemap_invalidate_lock_shared(inode->i_mapping);
@@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode)
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
- if (err)
+ if (err) {
+ /*
+ * Always truncate page #0 to avoid page cache
+ * leak in evict() path.
+ */
+ truncate_inode_pages_range(inode->i_mapping,
+ F2FS_BLK_TO_BYTES(0),
+ F2FS_BLK_END_BYTES(0));
return err;
+ }
}
err = f2fs_truncate_blocks(inode, i_size_read(inode), true);
@@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
filemap_invalidate_lock(inode->i_mapping);
if (attr->ia_size > old_size)
- f2fs_zero_post_eof_page(inode, attr->ia_size);
+ f2fs_zero_post_eof_page(inode, attr->ia_size, false);
truncate_setsize(inode, attr->ia_size);
if (attr->ia_size <= old_size)
@@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
+ f2fs_zero_post_eof_page(inode, offset + len, false);
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
@@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- filemap_invalidate_lock(mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
+ f2fs_zero_post_eof_page(inode, offset + len, false);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
if (err)
return err;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, offset + len);
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len, true);
f2fs_balance_fs(sbi, true);
@@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;
- filemap_invalidate_lock(inode->i_mapping);
- f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from));
- filemap_invalidate_unlock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode,
+ iocb->ki_pos + iov_iter_count(from), true);
return count;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 098e9f71421e..a7708cf80c04 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1071,7 +1071,7 @@ next_step:
}
/* phase == 2 */
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
continue;
@@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
return false;
@@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) {
+ f2fs_err(sbi, "%s: segment %u is used by log",
+ __func__, segno);
+ f2fs_bug_on(sbi, 1);
+ goto skip;
+ }
+
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
if (gc_type == BG_GC && __is_large_section(sbi) &&
@@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
sum = folio_address(sum_folio);
if (type != GET_SUM_TYPE((&sum->footer))) {
- f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
+ f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA",
segno, type, GET_SUM_TYPE((&sum->footer)));
f2fs_stop_checkpoint(sbi, false,
STOP_CP_REASON_CORRUPTED_SUMMARY);
@@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
+ /*
+ * avoid migrating empty section, as it can be allocated by
+ * log in parallel.
+ */
+ if (!get_valid_blocks(sbi, segno, true))
+ continue;
+
if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno)))
continue;
@@ -2182,6 +2196,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
MAIN_SECS(sbi) += secs;
+ if (sbi->allocate_section_hint > MAIN_SECS(sbi))
+ sbi->allocate_section_hint = MAIN_SECS(sbi);
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
@@ -2189,6 +2205,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
if (f2fs_is_multi_device(sbi)) {
int last_dev = sbi->s_ndevs - 1;
+ sbi->allocate_section_hint = FDEV(0).total_segments /
+ SEGS_PER_SEC(sbi);
+
FDEV(last_dev).total_segments =
(int)FDEV(last_dev).total_segments + segs;
FDEV(last_dev).end_blk =
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27743b93e186..482a362f2625 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab;
static struct kmem_cache *nat_entry_set_slab;
static struct kmem_cache *fsync_node_entry_slab;
+static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid;
+}
+
/*
* Check whether the given nid is within node id range.
*/
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
{
- if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
+ if (unlikely(is_invalid_nid(sbi, nid))) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
__func__, nid);
@@ -871,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
if (!done) {
- nfolio[i] = f2fs_get_node_folio(sbi, nids[i]);
+ nfolio[i] = f2fs_get_node_folio(sbi, nids[i],
+ NODE_TYPE_NON_INODE);
if (IS_ERR(nfolio[i])) {
err = PTR_ERR(nfolio[i]);
f2fs_folio_put(nfolio[0], false);
@@ -989,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn)
return 1;
/* get direct node */
- folio = f2fs_get_node_folio(sbi, dn->nid);
+ folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE);
if (PTR_ERR(folio) == -ENOENT)
return 1;
else if (IS_ERR(folio))
@@ -1033,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid);
+ folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid,
+ NODE_TYPE_NON_INODE);
if (IS_ERR(folio)) {
trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio));
return PTR_ERR(folio);
@@ -1111,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
/* reference count'll be increased */
- folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]);
+ folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i],
+ NODE_TYPE_NON_INODE);
if (IS_ERR(folios[i])) {
err = PTR_ERR(folios[i]);
idx = i - 1;
@@ -1496,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
struct folio *folio, pgoff_t nid,
enum node_type ntype)
{
- if (unlikely(nid != nid_of_node(folio) ||
- (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) ||
- (ntype == NODE_TYPE_XATTR &&
- !f2fs_has_xattr_block(ofs_of_node(folio))) ||
- time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
- f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
- "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- ntype, nid, nid_of_node(folio), ino_of_node(folio),
- ofs_of_node(folio), cpver_of_node(folio),
- next_blkaddr_of_node(folio));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
- return -EFSCORRUPTED;
+ if (unlikely(nid != nid_of_node(folio)))
+ goto out_err;
+
+ switch (ntype) {
+ case NODE_TYPE_INODE:
+ if (!IS_INODE(folio))
+ goto out_err;
+ break;
+ case NODE_TYPE_XATTR:
+ if (!f2fs_has_xattr_block(ofs_of_node(folio)))
+ goto out_err;
+ break;
+ case NODE_TYPE_NON_INODE:
+ if (IS_INODE(folio))
+ goto out_err;
+ break;
+ default:
+ break;
}
+ if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))
+ goto out_err;
return 0;
+out_err:
+ f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
+ "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ ntype, nid, nid_of_node(folio), ino_of_node(folio),
+ ofs_of_node(folio), cpver_of_node(folio),
+ next_blkaddr_of_node(folio));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ return -EFSCORRUPTED;
}
static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
@@ -1546,7 +1570,7 @@ repeat:
if (unlikely(!folio_test_uptodate(folio))) {
err = -EIO;
- goto out_err;
+ goto out_put_err;
}
if (!f2fs_inode_chksum_verify(sbi, folio)) {
@@ -1567,9 +1591,10 @@ out_put_err:
return ERR_PTR(err);
}
-struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid)
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ enum node_type node_type)
{
- return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR);
+ return __get_node_folio(sbi, nid, NULL, 0, node_type);
}
struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino)
@@ -2634,6 +2659,16 @@ retry:
f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
i = list_first_entry(&nm_i->free_nid_list,
struct free_nid, list);
+
+ if (unlikely(is_invalid_nid(sbi, i->nid))) {
+ spin_unlock(&nm_i->nid_list_lock);
+ f2fs_err(sbi, "Corrupted nid %u in free_nid_list",
+ i->nid);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_CORRUPTED_NID);
+ return false;
+ }
+
*nid = i->nid;
__move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 030390543b54..9cb8dcf8d417 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -57,6 +57,7 @@ enum node_type {
NODE_TYPE_REGULAR,
NODE_TYPE_INODE,
NODE_TYPE_XATTR,
+ NODE_TYPE_NON_INODE,
};
/*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 4cb3a91801b4..215e442db72c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -548,7 +548,7 @@ got_it:
}
/* Get the node page */
- node_folio = f2fs_get_node_folio(sbi, nid);
+ node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
if (IS_ERR(node_folio))
return PTR_ERR(node_folio);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index cc82d42ef14c..b45eace879d7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
+ unsigned int alloc_policy = sbi->allocate_section_policy;
+ unsigned int alloc_hint = sbi->allocate_section_hint;
bool init = true;
int i;
int ret = 0;
@@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
}
#endif
+ /*
+ * Prevent allocate_section_hint from exceeding MAIN_SECS()
+ * due to desynchronization.
+ */
+ if (alloc_policy != ALLOCATE_FORWARD_NOHINT &&
+ alloc_hint > MAIN_SECS(sbi))
+ alloc_hint = MAIN_SECS(sbi);
+
+ if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT &&
+ hint < alloc_hint)
+ hint = alloc_hint;
+ else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT &&
+ hint >= alloc_hint)
+ hint = 0;
+
find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
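
The two policies above steer allocation around a fixed section boundary (in practice, the first device's share of sections): ALLOCATE_FORWARD_FROM_HINT pins the search at or after the boundary, while ALLOCATE_FORWARD_WITHIN_HINT wraps the search back to section 0 once it crosses it. As a pure function over the hint (enum names abbreviated):

    enum alloc_policy { NOHINT, WITHIN_HINT, FROM_HINT };

    static unsigned int adjust_hint(enum alloc_policy policy,
                                    unsigned int hint,
                                    unsigned int boundary,
                                    unsigned int main_secs)
    {
            if (policy != NOHINT && boundary > main_secs)
                    boundary = main_secs;   /* guard against stale boundary */
            if (policy == FROM_HINT && hint < boundary)
                    return boundary;        /* search at/after the boundary */
            if (policy == WITHIN_HINT && hint >= boundary)
                    return 0;               /* wrap back below the boundary */
            return hint;
    }
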
@@ -3672,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
- f2fs_is_cow_file(inode))
+ f2fs_is_cow_file(inode) ||
+ is_inode_flag_set(inode, FI_NEED_IPU))
return CURSEG_HOT_DATA;
return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
@@ -3936,12 +3954,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
int seg_type = log_type_to_seg_type(type);
bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
seg_type == CURSEG_COLD_DATA);
+ int err;
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
- if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type, fio)) {
+ err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type, fio);
+ if (unlikely(err)) {
+ f2fs_err_ratelimited(fio->sbi,
+ "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d",
+ __func__, fio->ino, folio->index, type,
+ fio->old_blkaddr, fio->new_blkaddr, err);
if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
folio_end_writeback(folio);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5e2ee5c686b1..1ce2c8abaf48 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
+static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi,
+ enum log_type type, unsigned int segno)
+{
+ if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
+ return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi,
+ (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
+ CURSEG_I(sbi, type)->next_blkoff;
+ return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true);
+}
+
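A worked case for the LFS large-section branch, assuming 4 segments per section, 512 blocks per segment and CAP_BLKS_PER_SEC(sbi) == 2048:

	/* segno 6 lies in section 1 (segments 4..7); with next_blkoff = 100:
	 *   left = 2048 - SEGS_TO_BLKS(sbi, 6 - 4) - 100
	 *        = 2048 - 1024 - 100 = 924 writable blocks left in the section */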
static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int data_blocks,
unsigned int dent_blocks)
@@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
if (unlikely(segno == NULL_SEGNO))
return false;
- if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
- CURSEG_I(sbi, i)->next_blkoff;
- } else {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
- }
+ left_blocks = get_left_section_blocks(sbi, i, segno);
blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks;
if (blocks > left_blocks)
@@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
if (unlikely(segno == NULL_SEGNO))
return false;
- if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
- CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff;
- } else {
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
- }
+ left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno);
if (dent_blocks > left_blocks)
return false;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 2619cbbd7d2d..fd8e7b0b2166 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -143,6 +143,7 @@ enum {
Opt_extent_cache,
Opt_data_flush,
Opt_reserve_root,
+ Opt_reserve_node,
Opt_resgid,
Opt_resuid,
Opt_mode,
@@ -181,6 +182,7 @@ enum {
Opt_nat_bits,
Opt_jqfmt,
Opt_checkpoint,
+ Opt_lookup_mode,
Opt_err,
};
@@ -244,6 +246,13 @@ static const struct constant_table f2fs_param_errors[] = {
{}
};
+static const struct constant_table f2fs_param_lookup_mode[] = {
+ {"perf", LOOKUP_PERF},
+ {"compat", LOOKUP_COMPAT},
+ {"auto", LOOKUP_AUTO},
+ {}
+};
+
static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc),
fsparam_flag("disable_roll_forward", Opt_disable_roll_forward),
@@ -265,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_flag_no("extent_cache", Opt_extent_cache),
fsparam_flag("data_flush", Opt_data_flush),
fsparam_u32("reserve_root", Opt_reserve_root),
+ fsparam_u32("reserve_node", Opt_reserve_node),
fsparam_gid("resgid", Opt_resgid),
fsparam_uid("resuid", Opt_resuid),
fsparam_enum("mode", Opt_mode, f2fs_param_mode),
@@ -300,6 +310,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = {
fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode),
fsparam_flag("age_extent_cache", Opt_age_extent_cache),
fsparam_enum("errors", Opt_errors, f2fs_param_errors),
+ fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode),
{}
};
@@ -336,6 +347,8 @@ static match_table_t f2fs_checkpoint_tokens = {
#define F2FS_SPEC_discard_unit (1 << 21)
#define F2FS_SPEC_memory_mode (1 << 22)
#define F2FS_SPEC_errors (1 << 23)
+#define F2FS_SPEC_lookup_mode (1 << 24)
+#define F2FS_SPEC_reserve_node (1 << 25)
struct f2fs_fs_context {
struct f2fs_mount_info info;
@@ -437,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { }
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count >> 3),
+ block_t block_limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
+ block_t node_limit = sbi->total_node_count >> 3;
/* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
- F2FS_OPTION(sbi).root_reserved_blocks > limit) {
- F2FS_OPTION(sbi).root_reserved_blocks = limit;
+ F2FS_OPTION(sbi).root_reserved_blocks > block_limit) {
+ F2FS_OPTION(sbi).root_reserved_blocks = block_limit;
f2fs_info(sbi, "Reduce reserved blocks for root = %u",
F2FS_OPTION(sbi).root_reserved_blocks);
}
- if (!test_opt(sbi, RESERVE_ROOT) &&
+ if (test_opt(sbi, RESERVE_NODE) &&
+ F2FS_OPTION(sbi).root_reserved_nodes > node_limit) {
+ F2FS_OPTION(sbi).root_reserved_nodes = node_limit;
+ f2fs_info(sbi, "Reduce reserved nodes for root = %u",
+ F2FS_OPTION(sbi).root_reserved_nodes);
+ }
+ if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) &&
(!uid_eq(F2FS_OPTION(sbi).s_resuid,
make_kuid(&init_user_ns, F2FS_DEF_RESUID)) ||
!gid_eq(F2FS_OPTION(sbi).s_resgid,
make_kgid(&init_user_ns, F2FS_DEF_RESGID))))
-		f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
+		f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root and reserve_node",
from_kuid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
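Both clamps implement the 12.5% rule from the comment above; for example, with an assumed total_node_count of 1 << 20:

	/* node_limit = (1 << 20) >> 3 = 131072, so a reserve_node=200000
	 * mount option is silently reduced to 131072 nodes */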
@@ -847,6 +868,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32;
ctx->spec_mask |= F2FS_SPEC_reserve_root;
break;
+ case Opt_reserve_node:
+ ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE);
+ F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_reserve_node;
+ break;
case Opt_resuid:
F2FS_CTX_INFO(ctx).s_resuid = result.uid;
ctx->spec_mask |= F2FS_SPEC_resuid;
@@ -994,6 +1020,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_enable:
+ F2FS_CTX_INFO(ctx).unusable_cap_perc = 0;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc;
+ F2FS_CTX_INFO(ctx).unusable_cap = 0;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap;
ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
default:
@@ -1149,6 +1179,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_nat_bits:
ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS);
break;
+ case Opt_lookup_mode:
+ F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_lookup_mode;
+ break;
}
return 0;
}
@@ -1191,7 +1225,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc,
goto err_jquota_change;
if (old_qname) {
- if (strcmp(old_qname, new_qname) == 0) {
+ if (!new_qname) {
+ f2fs_info(sbi, "remove qf_name %s",
+ old_qname);
+ continue;
+ } else if (strcmp(old_qname, new_qname) == 0) {
ctx->qname_mask &= ~(1 << i);
continue;
}
@@ -1430,6 +1468,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc,
ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT;
}
+ if (test_opt(sbi, RESERVE_NODE) &&
+ (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) {
+ f2fs_info(sbi, "Preserve previous reserve_node=%u",
+ F2FS_OPTION(sbi).root_reserved_nodes);
+ ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE);
+ ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE;
+ }
err = f2fs_check_test_dummy_encryption(fc, sb);
if (err)
@@ -1629,6 +1675,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
if (ctx->spec_mask & F2FS_SPEC_reserve_root)
F2FS_OPTION(sbi).root_reserved_blocks =
F2FS_CTX_INFO(ctx).root_reserved_blocks;
+ if (ctx->spec_mask & F2FS_SPEC_reserve_node)
+ F2FS_OPTION(sbi).root_reserved_nodes =
+ F2FS_CTX_INFO(ctx).root_reserved_nodes;
if (ctx->spec_mask & F2FS_SPEC_resgid)
F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid;
if (ctx->spec_mask & F2FS_SPEC_resuid)
@@ -1658,6 +1707,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode;
if (ctx->spec_mask & F2FS_SPEC_errors)
F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors;
+ if (ctx->spec_mask & F2FS_SPEC_lookup_mode)
+ F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode;
f2fs_apply_compression(fc, sb);
f2fs_apply_test_dummy_encryption(fc, sb);
@@ -2349,9 +2400,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
seq_puts(seq, "fragment:block");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
- if (test_opt(sbi, RESERVE_ROOT))
- seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
+ if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE))
+		seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u,resgid=%u",
F2FS_OPTION(sbi).root_reserved_blocks,
+ F2FS_OPTION(sbi).root_reserved_nodes,
from_kuid_munged(&init_user_ns,
F2FS_OPTION(sbi).s_resuid),
from_kgid_munged(&init_user_ns,
@@ -2422,6 +2475,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, NAT_BITS))
seq_puts(seq, ",nat_bits");
+ if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF)
+ seq_show_option(seq, "lookup_mode", "perf");
+ else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT)
+ seq_show_option(seq, "lookup_mode", "compat");
+ else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO)
+ seq_show_option(seq, "lookup_mode", "auto");
+
return 0;
}
@@ -2486,6 +2546,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
#endif
f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
+
+ F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF;
}
#ifdef CONFIG_QUOTA
@@ -2566,21 +2628,39 @@ out_unlock:
restore_flag:
sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+	f2fs_info(sbi, "f2fs_disable_checkpoint() finished, err:%d", err);
return err;
}
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
- int retry = DEFAULT_RETRY_IO_COUNT;
+ unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16;
+ long long start, writeback, end;
+
+ f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld",
+ get_pages(sbi, F2FS_DIRTY_META),
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DATA));
+
+ f2fs_update_time(sbi, ENABLE_TIME);
+
+ start = ktime_get();
/* we should flush all the data to keep data consistency */
- do {
- sync_inodes_sb(sbi->sb);
+ while (get_pages(sbi, F2FS_DIRTY_DATA)) {
+ writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC);
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
- } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
- if (unlikely(retry < 0))
- f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
+ if (f2fs_time_over(sbi, ENABLE_TIME))
+ break;
+ }
+ writeback = ktime_get();
+
+ sync_inodes_sb(sbi->sb);
+
+ if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA)))
+ f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld",
+ get_pages(sbi, F2FS_DIRTY_DATA));
f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -2593,6 +2673,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* Let's ensure there's no pending checkpoint anymore */
f2fs_flush_ckpt_thread(sbi);
+
+ end = ktime_get();
+
+	f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%lld ms, sync:%lld ms",
+ ktime_ms_delta(writeback, start),
+ ktime_ms_delta(end, writeback));
}
static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
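The rewrite trades the fixed DEFAULT_RETRY_IO_COUNT retries for a time budget: each pass writes back roughly 1/16 of the dirty data counted at entry, and a final sync_inodes_sb() picks up the remainder. With assumed numbers:

	/* 65536 dirty data pages at entry -> nr_pages = 4096 per pass;
	 * passes repeat until F2FS_DIRTY_DATA drains or ENABLE_TIME expires
	 * (DEF_ENABLE_INTERVAL, defined elsewhere in this series), after
	 * which one sync_inodes_sb() flushes whatever is left */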
@@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->total_node_count = SEGS_TO_BLKS(sbi,
((le32_to_cpu(raw_super->segment_count_nat) / 2) *
NAT_ENTRY_PER_BLOCK));
+ sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count);
+ sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT;
F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
@@ -4179,6 +4267,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+ sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL;
sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -4637,9 +4726,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
sbi->aligned_blksize = true;
+ sbi->bggc_io_aware = AWARE_ALL_IO;
#ifdef CONFIG_BLK_DEV_ZONED
sbi->max_open_zones = UINT_MAX;
sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ;
+ sbi->bggc_io_aware = AWARE_READ_IO;
#endif
for (i = 0; i < max_devices; i++) {
@@ -4667,6 +4758,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
SEGS_TO_BLKS(sbi,
FDEV(i).total_segments) - 1 +
le32_to_cpu(raw_super->segment0_blkaddr);
+ sbi->allocate_section_hint = FDEV(i).total_segments /
+ SEGS_PER_SEC(sbi);
} else {
FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
FDEV(i).end_blk = FDEV(i).start_blk +
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f736052dea50..6d2a4fba68a2 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a,
le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags));
}
+static ssize_t effective_lookup_mode_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ switch (F2FS_OPTION(sbi).lookup_mode) {
+ case LOOKUP_PERF:
+ return sysfs_emit(buf, "perf\n");
+ case LOOKUP_COMPAT:
+ return sysfs_emit(buf, "compat\n");
+ case LOOKUP_AUTO:
+ if (sb_no_casefold_compat_fallback(sbi->sb))
+ return sysfs_emit(buf, "auto:perf\n");
+ return sysfs_emit(buf, "auto:compat\n");
+ }
+ return 0;
+}
+
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -866,6 +882,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "bggc_io_aware")) {
+ if (t < AWARE_ALL_IO || t > AWARE_NONE)
+ return -EINVAL;
+ sbi->bggc_io_aware = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "allocate_section_hint")) {
+ if (t < 0 || t > MAIN_SECS(sbi))
+ return -EINVAL;
+ sbi->allocate_section_hint = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "allocate_section_policy")) {
+ if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT)
+ return -EINVAL;
+ sbi->allocate_section_policy = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -1138,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search);
F2FS_SBI_GENERAL_RW_ATTR(migration_granularity);
F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint);
+F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms);
@@ -1175,6 +1214,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
F2FS_SBI_GENERAL_RW_ATTR(carve_out);
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
+F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1211,6 +1251,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(encoding_flags);
+F2FS_GENERAL_RO_ATTR(effective_lookup_mode);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
@@ -1303,6 +1344,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
+ ATTR_LIST(bggc_io_aware),
#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
@@ -1329,6 +1371,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
ATTR_LIST(encoding_flags),
+ ATTR_LIST(effective_lookup_mode),
ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
@@ -1371,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_read_extent_count),
ATTR_LIST(carve_out),
ATTR_LIST(reserved_pin_section),
+ ATTR_LIST(allocate_section_hint),
+ ATTR_LIST(allocate_section_policy),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1723,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
seq_printf(seq, " Main : 0x%010x (%10d)\n",
SM_I(sbi)->main_blkaddr,
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
- seq_printf(seq, " # of Sections : %12d\n",
- le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+ seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10);
+ seq_printf(seq, " Segment size : %12d MB\n",
+ (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
seq_printf(seq, " Segs/Sections : %12d\n",
SEGS_PER_SEC(sbi));
seq_printf(seq, " Section size : %12d MB\n",
- SEGS_PER_SEC(sbi) << 1);
+ (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10);
+ seq_printf(seq, " # of Sections : %12d\n",
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
if (!f2fs_is_multi_device(sbi))
return 0;
@@ -1742,6 +1790,69 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused donation_list_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ struct dentry *dentry;
+ char *buf, *path;
+ int i;
+
+ buf = f2fs_getname(sbi);
+ if (!buf)
+ return 0;
+
+	seq_puts(seq, "Donation List\n");
+ seq_printf(seq, " # of files : %u\n", sbi->donate_files);
+ seq_printf(seq, " %-50s %10s %20s %20s %22s\n",
+ "File path", "Status", "Donation offset (kb)",
+ "Donation size (kb)", "File cached size (kb)");
+	seq_puts(seq, "---\n");
+
+ for (i = 0; i < sbi->donate_files; i++) {
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ if (list_empty(&sbi->inode_list[DONATE_INODE])) {
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ break;
+ }
+ fi = list_first_entry(&sbi->inode_list[DONATE_INODE],
+ struct f2fs_inode_info, gdonate_list);
+ list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+
+ if (!inode)
+ continue;
+
+ inode_lock_shared(inode);
+
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ path = NULL;
+ } else {
+ path = dentry_path_raw(dentry, buf, PATH_MAX);
+ if (IS_ERR(path))
+ goto next;
+ }
+ seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n",
+ path ? path : "<unlinked>",
+ is_inode_flag_set(inode, FI_DONATE_FINISHED) ?
+ "Evicted" : "Donated",
+ (loff_t)fi->donate_start << (PAGE_SHIFT - 10),
+ (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10),
+ (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10));
+next:
+ dput(dentry);
+ inode_unlock_shared(inode);
+ iput(inode);
+ }
+ f2fs_putname(buf);
+ return 0;
+}
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
static int __maybe_unused inject_stats_seq_show(struct seq_file *seq,
void *offset)
@@ -1851,6 +1962,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
discard_plist_seq_show, sb);
proc_create_single_data("disk_map", 0444, sbi->s_proc,
disk_map_seq_show, sb);
+ proc_create_single_data("donation_list", 0444, sbi->s_proc,
+ donation_list_seq_show, sb);
#ifdef CONFIG_F2FS_FAULT_INJECTION
proc_create_single_data("inject_stats", 0444, sbi->s_proc,
inject_stats_seq_show, sb);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index acbec5bdd521..92b091783966 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1209,7 +1209,7 @@ EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
int *nr_cluster, struct msdos_dir_entry **de,
- struct buffer_head **bh, loff_t *i_pos)
+ struct buffer_head **bh)
{
struct super_block *sb = dir->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -1269,7 +1269,6 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
get_bh(bhs[n]);
*bh = bhs[n];
*de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
- *i_pos = fat_make_i_pos(sb, *bh, *de);
/* Second stage: clear the rest of cluster, and write outs */
err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE);
@@ -1298,7 +1297,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
struct msdos_dir_entry *de;
int err, free_slots, i, nr_bhs;
- loff_t pos, i_pos;
+ loff_t pos;
sinfo->nr_slots = nr_slots;
@@ -1386,7 +1385,7 @@ found:
* add the cluster to dir.
*/
cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster,
- &de, &bh, &i_pos);
+ &de, &bh);
if (cluster < 0) {
err = cluster;
goto error_remove;
diff --git a/fs/file_table.c b/fs/file_table.c
index 81c72576e548..b223d873e48b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -54,7 +54,7 @@ struct backing_file {
#define backing_file(f) container_of(f, struct backing_file, file)
-struct path *backing_file_user_path(const struct file *f)
+const struct path *backing_file_user_path(const struct file *f)
{
return &backing_file(f)->user_path;
}
@@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- memset(&f->f_path, 0, sizeof(f->f_path));
+	memset(&f->__f_path, 0, sizeof(f->__f_path));
memset(&f->f_ra, 0, sizeof(f->f_ra));
f->f_flags = flags;
@@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
static void file_init_path(struct file *file, const struct path *path,
const struct file_operations *fop)
{
- file->f_path = *path;
+ file->__f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 666e61753aed..93b7ebf8d927 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
EXPORT_SYMBOL(vfs_parse_fs_param);
/**
- * vfs_parse_fs_string - Convenience function to just parse a string.
+ * vfs_parse_fs_qstr - Convenience function to just parse a string.
* @fc: Filesystem context.
* @key: Parameter name.
 * @value: Parameter value, or NULL to treat @key as a flag.
- * @v_size: Maximum number of bytes in the value.
*/
-int vfs_parse_fs_string(struct fs_context *fc, const char *key,
- const char *value, size_t v_size)
+int vfs_parse_fs_qstr(struct fs_context *fc, const char *key,
+ const struct qstr *value)
{
int ret;
struct fs_parameter param = {
.key = key,
.type = fs_value_is_flag,
- .size = v_size,
+ .size = value ? value->len : 0,
};
if (value) {
- param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL);
if (!param.string)
return -ENOMEM;
param.type = fs_value_is_string;
@@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
kfree(param.string);
return ret;
}
-EXPORT_SYMBOL(vfs_parse_fs_string);
+EXPORT_SYMBOL(vfs_parse_fs_qstr);
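The monolithic parser below still calls a three-argument vfs_parse_fs_string(), so a thin compatibility wrapper presumably remains in the headers, along these lines (a sketch, assuming the QSTR() initializer from <linux/dcache.h>):

	static inline int vfs_parse_fs_string(struct fs_context *fc,
					      const char *key, const char *value)
	{
		struct qstr v = QSTR(value ? value : "");

		return vfs_parse_fs_qstr(fc, key, value ? &v : NULL);
	}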
/**
* vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
@@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
while ((key = sep(&options)) != NULL) {
if (*key) {
- size_t v_len = 0;
char *value = strchr(key, '=');
if (value) {
if (unlikely(value == key))
continue;
*value++ = 0;
- v_len = strlen(value);
}
- ret = vfs_parse_fs_string(fc, key, value, v_len);
+ ret = vfs_parse_fs_string(fc, key, value);
if (ret < 0)
break;
}
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index a774166264de..3a4ae632c94a 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -13,7 +13,7 @@ config FUSE_FS
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
- See <file:Documentation/filesystems/fuse.rst> for more information.
+ See <file:Documentation/filesystems/fuse/fuse.rst> for more information.
See <file:Documentation/Changes> for needed library/utility version.
If you want to develop a userspace FS, or if you want to use
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3f0f312a31c1..22ad9538dfc4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := trace.o # put trace.o first so we see ftrace errors sooner
+fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
-fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
new file mode 100644
index 000000000000..4afda419dd14
--- /dev/null
+++ b/fs/fuse/backing.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+ if (fb && refcount_inc_not_zero(&fb->count))
+ return fb;
+ return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+ pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+ if (fb->file)
+ fput(fb->file);
+ put_cred(fb->cred);
+ kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+ if (fb && refcount_dec_and_test(&fb->count))
+ fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+ idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&fc->lock);
+ /* FIXME: xarray might be space inefficient */
+ id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+ spin_unlock(&fc->lock);
+ idr_preload_end();
+
+ WARN_ON_ONCE(id == 0);
+ return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+ int id)
+{
+ struct fuse_backing *fb;
+
+ spin_lock(&fc->lock);
+ fb = idr_remove(&fc->backing_files_map, id);
+ spin_unlock(&fc->lock);
+
+ return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+ struct fuse_backing *fb = p;
+
+ WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+ fuse_backing_free(fb);
+ return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+ idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+ idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+ struct file *file;
+ struct super_block *backing_sb;
+ struct fuse_backing *fb = NULL;
+ int res;
+
+ pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ res = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ res = -EINVAL;
+ if (map->flags || map->padding)
+ goto out;
+
+ file = fget_raw(map->fd);
+ res = -EBADF;
+ if (!file)
+ goto out;
+
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
+ backing_sb = file_inode(file)->i_sb;
+ res = -ELOOP;
+ if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+ goto out_fput;
+
+ fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+ res = -ENOMEM;
+ if (!fb)
+ goto out_fput;
+
+ fb->file = file;
+ fb->cred = prepare_creds();
+ refcount_set(&fb->count, 1);
+
+ res = fuse_backing_id_alloc(fc, fb);
+ if (res < 0) {
+ fuse_backing_free(fb);
+ fb = NULL;
+ }
+
+out:
+ pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+ return res;
+
+out_fput:
+ fput(file);
+ goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb = NULL;
+ int err;
+
+ pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+ /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+ err = -EPERM;
+ if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ if (backing_id <= 0)
+ goto out;
+
+ err = -ENOENT;
+ fb = fuse_backing_id_remove(fc, backing_id);
+ if (!fb)
+ goto out;
+
+ fuse_backing_put(fb);
+ err = 0;
+out:
+ pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+ return err;
+}
+
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id)
+{
+ struct fuse_backing *fb;
+
+ rcu_read_lock();
+ fb = idr_find(&fc->backing_files_map, backing_id);
+ fb = fuse_backing_get(fb);
+ rcu_read_unlock();
+
+ return fb;
+}
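For context, the open/close pair above is driven from a FUSE server through device ioctls; a minimal userspace sketch using the UAPI definitions (error handling trimmed):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fuse.h>

	/* returns a positive backing_id on success; the kernel takes its own
	 * reference via fget_raw(), so the server may close map.fd afterwards */
	static int register_backing(int devfd, const char *path)
	{
		struct fuse_backing_map map = { .fd = open(path, O_RDWR) };

		if (map.fd < 0)
			return -1;
		return ioctl(devfd, FUSE_DEV_IOC_BACKING_OPEN, &map);
	}

	/* drops the reference installed by FUSE_DEV_IOC_BACKING_OPEN */
	static int unregister_backing(int devfd, __u32 backing_id)
	{
		return ioctl(devfd, FUSE_DEV_IOC_BACKING_CLOSE, &backing_id);
	}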
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b39844d75a80..28c96961e85d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#define CUSE_CONNTBL_LEN 64
@@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
*/
static int cuse_channel_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = file->private_data;
+ struct fuse_dev *fud = __fuse_get_dev(file);
struct cuse_conn *cc = fc_to_cc(fud->fc);
/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ad8645c0f9fe..132f38619d70 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,7 +25,6 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
-#define CREATE_TRACE_POINTS
#include "fuse_trace.h"
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
@@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,
if (fuse_block_alloc(fc, for_background)) {
err = -EINTR;
- if (wait_event_killable_exclusive(fc->blocked_waitq,
- !fuse_block_alloc(fc, for_background)))
+ if (wait_event_state_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background),
+ (TASK_KILLABLE | TASK_FREEZABLE)))
goto out;
}
/* Matches smp_wmb() in fuse_set_initialized() */
@@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
+EXPORT_SYMBOL_GPL(fuse_req_hash);
/*
* A new request is available, wake fiq->waitq
@@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
}
}
+static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique_locked(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+
+inline void fuse_request_assign_unique(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+{
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ /* tracepoint captures in.h.unique and in.h.len */
+ trace_fuse_request_send(req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_assign_unique);
+
static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique_locked(fiq);
+ fuse_request_assign_unique_locked(fiq, req);
list_add_tail(&req->list, &fiq->pending);
fuse_dev_wake_and_unlock(fiq);
} else {
@@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
- trace_fuse_request_send(req);
fiq->ops->send_req(fiq, req);
}
@@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
{
struct fuse_iqueue *fiq = &fc->iq;
- req->in.h.unique = fuse_get_unique(fiq);
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
+ fuse_request_assign_unique(fiq, req);
return fuse_uring_queue_bq_req(req);
}
@@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file)
return 0;
}
+struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ struct fuse_dev *fud = __fuse_get_dev(file);
+ int err;
+
+ if (likely(fud))
+ return fud;
+
+ err = wait_event_interruptible(fuse_dev_waitq,
+ READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT);
+ if (err)
+ return ERR_PTR(err);
+
+ fud = __fuse_get_dev(file);
+ if (!fud)
+ return ERR_PTR(-EPERM);
+
+ return fud;
+}
+
static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
{
struct fuse_copy_state cs;
struct file *file = iocb->ki_filp;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!user_backed_iter(to))
return -EINVAL;
@@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
struct fuse_copy_state cs;
struct fuse_dev *fud = fuse_get_dev(in);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
@@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_poll_wakeup_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
return fuse_notify_poll_wakeup(fc, &outarg);
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_inode_out outarg;
- int err = -EINVAL;
+ int err;
if (size != sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
fuse_copy_finish(cs);
down_read(&fc->killsb);
@@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
-
-err:
- fuse_copy_finish(cs);
- return err;
}
static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
@@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_inval_entry_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
{
struct fuse_notify_delete_out outarg;
int err;
- char *buf = NULL;
+ char *buf;
struct qstr name;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto err;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto err;
+ return err;
- err = -ENAMETOOLONG;
if (outarg.namelen > fc->name_max)
- goto err;
+ return -ENAMETOOLONG;
- err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
- goto err;
+ return -EINVAL;
- err = -ENOMEM;
buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
if (!buf)
- goto err;
+ return -ENOMEM;
name.name = buf;
name.len = outarg.namelen;
@@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
- kfree(buf);
- return err;
-
err:
kfree(buf);
- fuse_copy_finish(cs);
return err;
}
@@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
loff_t file_size;
loff_t end;
- err = -EINVAL;
if (size < sizeof(outarg))
- goto out_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto out_finish;
+ return err;
- err = -EINVAL;
if (size - sizeof(outarg) != outarg.size)
- goto out_finish;
+ return -EINVAL;
nodeid = outarg.nodeid;
@@ -1824,8 +1839,6 @@ out_iput:
iput(inode);
out_up_killsb:
up_read(&fc->killsb);
-out_finish:
- fuse_copy_finish(cs);
return err;
}
@@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
u64 nodeid;
int err;
- err = -EINVAL;
if (size != sizeof(outarg))
- goto copy_finish;
+ return -EINVAL;
err = fuse_copy_one(cs, &outarg, sizeof(outarg));
if (err)
- goto copy_finish;
+ return err;
fuse_copy_finish(cs);
@@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
up_read(&fc->killsb);
return err;
-
-copy_finish:
- fuse_copy_finish(cs);
- return err;
}
/*
@@ -2044,6 +2052,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc)
return 0;
}
+static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_prune_out outarg;
+ const unsigned int batch = 512;
+ u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL);
+ unsigned int num, i;
+ int err;
+
+ if (!nodeids)
+ return -ENOMEM;
+
+ if (size < sizeof(outarg))
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ return err;
+
+ if (size - sizeof(outarg) != outarg.count * sizeof(u64))
+ return -EINVAL;
+
+ for (; outarg.count; outarg.count -= num) {
+ num = min(batch, outarg.count);
+ err = fuse_copy_one(cs, nodeids, num * sizeof(u64));
+ if (err)
+ return err;
+
+ scoped_guard(rwsem_read, &fc->killsb) {
+ for (i = 0; i < num; i++)
+ fuse_try_prune_one_inode(fc, nodeids[i]);
+ }
+ }
+ return 0;
+}
+
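The payload consumed above is a fixed header followed by a flat array of node IDs; a hedged summary of the layout (count is used above, the rest of the header is assumed):

	/* struct fuse_notify_prune_out { uint32_t count; uint32_t padding; };
	 * followed immediately by:  uint64_t nodeids[count];
	 * draining in batches of 512 keeps the kernel-side buffer at a single
	 * 4 KiB allocation no matter how large count is */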
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -2075,8 +2119,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_INC_EPOCH:
return fuse_notify_inc_epoch(fc);
+ case FUSE_NOTIFY_PRUNE:
+ return fuse_notify_prune(fc, size, cs);
+
default:
- fuse_copy_finish(cs);
return -EINVAL;
}
}
@@ -2156,7 +2202,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- goto out;
+ goto copy_finish;
}
err = -EINVAL;
@@ -2229,7 +2275,7 @@ copy_finish:
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
{
struct fuse_copy_state cs;
- struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+ struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp);
if (!fud)
return -EPERM;
@@ -2251,11 +2297,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
unsigned idx;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
- struct fuse_dev *fud;
+ struct fuse_dev *fud = __fuse_get_dev(out);
size_t rem;
ssize_t ret;
- fud = fuse_get_dev(out);
if (!fud)
return -EPERM;
@@ -2341,7 +2386,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
struct fuse_iqueue *fiq;
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
+ if (IS_ERR(fud))
return EPOLLERR;
fiq = &fud->fc->iq;
@@ -2394,7 +2439,7 @@ static void end_polls(struct fuse_conn *fc)
* The same effect is usually achievable through killing the filesystem daemon
* and all users of the filesystem. The exception is the combination of an
* asynchronous request and the tricky deadlock (see
- * Documentation/filesystems/fuse.rst).
+ * Documentation/filesystems/fuse/fuse.rst).
*
* Aborting requests under I/O goes as follows: 1: Separate out unlocked
* requests, they should be finished off immediately. Locked requests will be
@@ -2488,7 +2533,7 @@ void fuse_wait_aborted(struct fuse_conn *fc)
int fuse_dev_release(struct inode *inode, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (fud) {
struct fuse_conn *fc = fud->fc;
@@ -2519,8 +2564,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
{
struct fuse_dev *fud = fuse_get_dev(file);
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
/* No locking - fasync_helper does its own locking */
return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
@@ -2530,7 +2575,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
{
struct fuse_dev *fud;
- if (new->private_data)
+ if (__fuse_get_dev(new))
return -EINVAL;
fud = fuse_dev_alloc_install(fc);
@@ -2561,7 +2606,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
* uses the same ioctl handler.
*/
if (fd_file(f)->f_op == file->f_op)
- fud = fuse_get_dev(fd_file(f));
+ fud = __fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
@@ -2579,8 +2624,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file,
struct fuse_dev *fud = fuse_get_dev(file);
struct fuse_backing_map map;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2596,8 +2641,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
struct fuse_dev *fud = fuse_get_dev(file);
int backing_id;
- if (!fud)
- return -EPERM;
+ if (IS_ERR(fud))
+ return PTR_ERR(fud);
if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
return -EOPNOTSUPP;
@@ -2608,6 +2653,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
return fuse_backing_close(fud->fc, backing_id);
}
+static long fuse_dev_ioctl_sync_init(struct file *file)
+{
+ int err = -EINVAL;
+
+ mutex_lock(&fuse_mutex);
+ if (!__fuse_get_dev(file)) {
+ WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT);
+ err = 0;
+ }
+ mutex_unlock(&fuse_mutex);
+ return err;
+}
+
static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -2623,6 +2681,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
case FUSE_DEV_IOC_BACKING_CLOSE:
return fuse_dev_ioctl_backing_close(file, argp);
+ case FUSE_DEV_IOC_SYNC_INIT:
+ return fuse_dev_ioctl_sync_init(file);
+
default:
return -ENOTTY;
}
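A server opts in before the connection is installed; sketched usage (the ioctl is introduced by this series, so treat the exact flow as an assumption):

	int devfd = open("/dev/fuse", O_RDWR | O_CLOEXEC);

	/* must run before the fd is passed to mount(2): only a NULL
	 * private_data may be replaced with the sentinel */
	if (ioctl(devfd, FUSE_DEV_IOC_SYNC_INIT) == 0) {
		/* a mount(2) with "fd=<devfd>" in its data now returns only
		 * after the FUSE_INIT reply has been processed */
	}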
@@ -2631,7 +2692,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
#ifdef CONFIG_PROC_FS
static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
{
- struct fuse_dev *fud = fuse_get_dev(file);
+ struct fuse_dev *fud = __fuse_get_dev(file);
if (!fud)
return;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a30c44234a4e..f6b12aebb8bb 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -7,6 +7,7 @@
#include "fuse_i.h"
#include "dev_uring_i.h"
#include "fuse_dev_i.h"
+#include "fuse_trace.h"
#include <linux/fs.h>
#include <linux/io_uring/cmd.h>
@@ -1139,9 +1140,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return -EINVAL;
fud = fuse_get_dev(cmd->file);
- if (!fud) {
+ if (IS_ERR(fud)) {
pr_info_ratelimited("No fuse device found\n");
- return -ENOTCONN;
+ return PTR_ERR(fud);
}
fc = fud->fc;
@@ -1268,8 +1269,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
if (!queue)
goto err;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
spin_lock(&queue->lock);
err = -ENOTCONN;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5c569c3cb53f..ecaec0fea3a1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -739,22 +739,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
int err;
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct fuse_conn *fc = get_fuse_conn(dir);
- struct dentry *res = NULL;
if (fuse_is_bad(dir))
return -EIO;
if (d_in_lookup(entry)) {
- res = fuse_lookup(dir, entry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- entry = res;
+ struct dentry *res = fuse_lookup(dir, entry, 0);
+ if (res || d_really_is_positive(entry))
+ return finish_no_open(file, res);
}
- if (!(flags & O_CREAT) || d_really_is_positive(entry))
- goto no_open;
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
/* Only creates */
file->f_mode |= FMODE_CREATED;
@@ -768,16 +764,13 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
goto mknod;
} else if (err == -EEXIST)
fuse_invalidate_entry(entry);
-out_dput:
- dput(res);
return err;
mknod:
err = fuse_mknod(idmap, dir, entry, mode, 0);
if (err)
- goto out_dput;
-no_open:
- return finish_no_open(file, res);
+ return err;
+ return finish_no_open(file, NULL);
}
/*
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4adcf09d4b01..f1ef77a0be05 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* Make the release synchronous if this is a fuseblk mount,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
+ *
+ * Always use the asynchronous file put because the current thread
+ * might be the fuse server. This can happen if a process starts some
+ * aio and closes the fd before the aio completes. Since aio takes its
+ * own ref to the file, the IO completion has to drop the ref, which is
+ * how the fuse server can end up closing its clients' files.
*/
- fuse_file_put(ff, ff->fm->fc->destroy);
+ fuse_file_put(ff, false);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -865,22 +871,20 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
struct fuse_args_pages *ap = &ia->ap;
size_t count = ia->read.in.size;
size_t num_read = args->out_args[0].size;
- struct address_space *mapping = NULL;
-
- for (i = 0; mapping == NULL && i < ap->num_folios; i++)
- mapping = ap->folios[i]->mapping;
+ struct address_space *mapping;
+ struct inode *inode;
- if (mapping) {
- struct inode *inode = mapping->host;
+ WARN_ON_ONCE(!ap->num_folios);
+ mapping = ap->folios[0]->mapping;
+ inode = mapping->host;
- /*
- * Short read means EOF. If file size is larger, truncate it
- */
- if (!err && num_read < count)
- fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!err && num_read < count)
+ fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
- fuse_invalidate_atime(inode);
- }
+ fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
folio_end_read(ap->folios[i], !err);
@@ -1175,7 +1179,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
num = min(iov_iter_count(ii), fc->max_write);
ap->args.in_pages = true;
- ap->descs[0].offset = offset;
while (num && ap->num_folios < max_folios) {
size_t tmp;
@@ -1823,19 +1826,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_folios; i++) {
+ for (i = 0; i < ap->num_folios; i++)
/*
* Benchmarks showed that ending writeback within the
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
iomap_finish_folio_write(inode, ap->folios[i], 1);
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- wb_writeout_inc(&bdi->wb);
- }
wake_up(&fi->page_waitq);
}
@@ -2010,14 +2009,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
uint32_t folio_index, loff_t offset, unsigned len)
{
- struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
ap->folios[folio_index] = folio;
ap->descs[folio_index].offset = offset;
ap->descs[folio_index].length = len;
-
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
@@ -2960,10 +2956,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
+ .len = len,
.flags = flags
};
struct fuse_write_out outarg;
+ struct fuse_copy_file_range_out outarg_64;
+ u64 bytes_copied;
ssize_t err;
/* mark unstable when write-back is not used, and file_out gets
* extended */
@@ -3013,33 +3011,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
- args.opcode = FUSE_COPY_FILE_RANGE;
+ args.opcode = FUSE_COPY_FILE_RANGE_64;
args.nodeid = ff_in->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
- args.out_args[0].size = sizeof(outarg);
- args.out_args[0].value = &outarg;
+ args.out_args[0].size = sizeof(outarg_64);
+ args.out_args[0].value = &outarg_64;
+ if (fc->no_copy_file_range_64) {
+fallback:
+ /* Fall back to old op that can't handle large copy length */
+ args.opcode = FUSE_COPY_FILE_RANGE;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK);
+ }
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_copy_file_range = 1;
- err = -EOPNOTSUPP;
+ if (fc->no_copy_file_range_64) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ } else {
+ fc->no_copy_file_range_64 = 1;
+ goto fallback;
+ }
}
- if (!err && outarg.size > len)
- err = -EIO;
-
if (err)
goto out;
+ bytes_copied = fc->no_copy_file_range_64 ?
+ outarg.size : outarg_64.bytes_copied;
+
+ if (bytes_copied > len) {
+ err = -EIO;
+ goto out;
+ }
+
truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
- ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+ ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1);
file_update_time(file_out);
- fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
+ fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied);
- err = outarg.size;
+ err = bytes_copied;
out:
if (is_unstable)
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
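The fallback clamp matters for large copies: the legacy FUSE_COPY_FILE_RANGE reply reports a 32-bit size, so the request length is capped and callers reissue for the remainder. For example:

	/* an 8 GiB request against an old server is clamped to
	 *   UINT_MAX & PAGE_MASK = 0xfffff000 bytes (4 GiB minus one 4 KiB page)
	 * per call; copy_file_range(2) callers loop for the rest */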
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 5a9bd771a319..6e8373f97040 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -12,6 +12,8 @@
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)
+extern struct wait_queue_head fuse_dev_waitq;
+
struct fuse_arg;
struct fuse_args;
struct fuse_pqueue;
@@ -37,15 +39,22 @@ struct fuse_copy_state {
} ring;
};
-static inline struct fuse_dev *fuse_get_dev(struct file *file)
+#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1)
+#define FUSE_DEV_PTR_MASK (~1UL)
+
+static inline struct fuse_dev *__fuse_get_dev(struct file *file)
{
/*
* Lockless access is OK, because file->private data is set
* once during mount and is valid until the file is released.
*/
- return READ_ONCE(file->private_data);
+ struct fuse_dev *fud = READ_ONCE(file->private_data);
+
+ return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK);
}
+struct fuse_dev *fuse_get_dev(struct file *file);
+
unsigned int fuse_req_hash(u64 unique);
struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
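The low-bit tag gives file->private_data three states; a short summary of how the helpers treat them:

	/* NULL               -> /dev/fuse opened, no connection installed
	 * FUSE_DEV_SYNC_INIT -> sync init requested, FUSE_INIT still in flight
	 * real fud pointer   -> connection live (bit 0 clear by alignment)
	 *
	 * __fuse_get_dev() masks bit 0, so the sentinel reads back as NULL;
	 * fuse_get_dev() additionally sleeps on fuse_dev_waitq until the real
	 * pointer is published by fuse_fill_super_common(). */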
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index cc428d04be3e..c2f2a48156d6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -856,6 +856,9 @@ struct fuse_conn {
/** Does the filesystem support copy_file_range? */
unsigned no_copy_file_range:1;
+ /** Does the filesystem support copy_file_range_64? */
+ unsigned no_copy_file_range_64:1;
+
/* Send DESTROY request */
unsigned int destroy:1;
@@ -901,6 +904,9 @@ struct fuse_conn {
/* Is link not implemented by fs? */
unsigned int no_link:1;
+ /* Is synchronous FUSE_INIT allowed? */
+ unsigned int sync_init:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
@@ -1255,6 +1261,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
+ * Assign a unique id to a fuse request
+ */
+void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+/**
* End a finished request
*/
void fuse_request_end(struct fuse_req *req);
@@ -1315,7 +1326,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_mount *fm);
+int fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -1407,6 +1418,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name, u32 flags);
+/*
+ * Try to prune this inode. If neither the inode itself nor dentries associated
+ * with this inode have any external reference, then the inode can be freed.
+ */
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid);
+
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1512,29 +1529,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir);
-/* passthrough.c */
-static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return READ_ONCE(fi->fb);
-#else
- return NULL;
-#endif
-}
-
-static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
- struct fuse_backing *fb)
-{
-#ifdef CONFIG_FUSE_PASSTHROUGH
- return xchg(&fi->fb, fb);
-#else
- return NULL;
-#endif
-}
-
+/* backing.c */
#ifdef CONFIG_FUSE_PASSTHROUGH
struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
void fuse_backing_put(struct fuse_backing *fb);
+struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id);
#else
static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
@@ -1545,6 +1544,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
static inline void fuse_backing_put(struct fuse_backing *fb)
{
}
+static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc,
+ int backing_id)
+{
+ return NULL;
+}
#endif
void fuse_backing_files_init(struct fuse_conn *fc);
@@ -1552,9 +1556,27 @@ void fuse_backing_files_free(struct fuse_conn *fc);
int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
int fuse_backing_close(struct fuse_conn *fc, int backing_id);
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id);
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return READ_ONCE(fi->fb);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+ struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+ return xchg(&fi->fb, fb);
+#else
+ return NULL;
+#endif
+}
+
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id);
void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7485a41af892..d1babf56f254 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include "dev_uring_i.h"
#include <linux/dax.h>
@@ -34,6 +35,7 @@ MODULE_LICENSE("GPL");
static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq);
static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -101,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (!fi)
return NULL;
- fi->i_time = 0;
+ /* Initialize private data (i.e. everything except fi->inode) */
+ BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0);
+ memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode));
+
fi->inval_mask = ~0;
- fi->nodeid = 0;
- fi->nlookup = 0;
- fi->attr_version = 0;
- fi->orig_ino = 0;
- fi->state = 0;
- fi->submount_lookup = NULL;
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
@@ -586,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
return 0;
}
+void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid)
+{
+ struct inode *inode;
+
+ inode = fuse_ilookup(fc, nodeid, NULL);
+ if (!inode)
+ return;
+ d_prune_aliases(inode);
+ iput(inode);
+}
+
bool fuse_lock_inode(struct inode *inode)
{
bool locked = false;
@@ -1469,7 +1479,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_mount *fm)
+static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
u64 flags;
@@ -1528,10 +1538,30 @@ void fuse_send_init(struct fuse_mount *fm)
ia->args.out_args[0].value = &ia->out;
ia->args.force = true;
ia->args.nocreds = true;
- ia->args.end = process_init_reply;
- if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fm, &ia->args, -ENOTCONN);
+ return ia;
+}
+
+int fuse_send_init(struct fuse_mount *fm)
+{
+ struct fuse_init_args *ia = fuse_new_init(fm);
+ int err;
+
+ if (fm->fc->sync_init) {
+ err = fuse_simple_request(fm, &ia->args);
+ /* Ignore size of init reply */
+ if (err > 0)
+ err = 0;
+ } else {
+ ia->args.end = process_init_reply;
+ err = fuse_simple_background(fm, &ia->args, GFP_KERNEL);
+ if (!err)
+ return 0;
+ }
+ process_init_reply(fm, &ia->args, err);
+ if (fm->fc->conn_error)
+ return -ENOTCONN;
+ return 0;
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1561,8 +1591,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- /* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
@@ -1821,6 +1849,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
!sb_set_blocksize(sb, PAGE_SIZE))
goto err;
#endif
+ fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -1872,8 +1901,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (ctx->fudptr && *ctx->fudptr)
- goto err_unlock;
+ if (ctx->fudptr && *ctx->fudptr) {
+ if (*ctx->fudptr == FUSE_DEV_SYNC_INIT)
+ fc->sync_init = 1;
+ else
+ goto err_unlock;
+ }
err = fuse_ctl_add_conn(fc);
if (err)
@@ -1881,8 +1914,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- if (ctx->fudptr)
+ if (ctx->fudptr) {
*ctx->fudptr = fud;
+ wake_up_all(&fuse_dev_waitq);
+ }
mutex_unlock(&fuse_mutex);
return 0;
@@ -1903,6 +1938,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_mount *fm;
int err;
if (!ctx->file || !ctx->rootmode_present ||
@@ -1923,8 +1959,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
return err;
/* file->private_data shall be visible on all CPUs after this */
smp_mb();
- fuse_send_init(get_fuse_mount_super(sb));
- return 0;
+
+ fm = get_fuse_mount_super(sb);
+
+ return fuse_send_init(fm);
}
/*
@@ -1985,7 +2023,7 @@ static int fuse_get_tree(struct fs_context *fsc)
* Allow creating a fuse mount with an already initialized fuse
* connection
*/
- fud = READ_ONCE(ctx->file->private_data);
+ fud = __fuse_get_dev(ctx->file);
if (ctx->file->f_op == &fuse_dev_operations && fud) {
fsc->sget_key = fud->fc;
sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
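Two details in the inode.c hunks are worth spelling out. fuse_send_init() now
returns an error so the synchronous sync_init path can fail the mount instead
of leaving a dead connection behind, and fuse_alloc_inode() replaces the
field-by-field clearing with a single memset over everything past the embedded
inode, with BUILD_BUG_ON() pinning the layout assumption. The memset trick in
isolation, as a userspace sketch (hypothetical struct; _Static_assert standing
in for BUILD_BUG_ON):

    #include <stddef.h>
    #include <string.h>

    struct vfs_part { long refcount; };      /* stand-in for struct inode */

    struct fs_inode {
            struct vfs_part inode;           /* must remain the first member */
            unsigned long i_time;
            unsigned long nodeid;
            void *submount_lookup;
    };

    static void fs_inode_clear_private(struct fs_inode *fi)
    {
            _Static_assert(offsetof(struct fs_inode, inode) == 0,
                           "inode must be the first member");
            /* zero all private state, padding included, in one call */
            memset((char *)fi + sizeof(fi->inode), 0,
                   sizeof(*fi) - sizeof(fi->inode));
    }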
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c99e285f3183..3728933188f3 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
(ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
return -EINVAL;
- fb = fuse_passthrough_open(file, inode,
- ff->args->open_outarg.backing_id);
+ fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id);
if (IS_ERR(fb))
return PTR_ERR(fb);
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index eb97ac009e75..72de97c03d0e 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -144,171 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
return backing_file_mmap(backing_file, vma, &ctx);
}
-struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
-{
- if (fb && refcount_inc_not_zero(&fb->count))
- return fb;
- return NULL;
-}
-
-static void fuse_backing_free(struct fuse_backing *fb)
-{
- pr_debug("%s: fb=0x%p\n", __func__, fb);
-
- if (fb->file)
- fput(fb->file);
- put_cred(fb->cred);
- kfree_rcu(fb, rcu);
-}
-
-void fuse_backing_put(struct fuse_backing *fb)
-{
- if (fb && refcount_dec_and_test(&fb->count))
- fuse_backing_free(fb);
-}
-
-void fuse_backing_files_init(struct fuse_conn *fc)
-{
- idr_init(&fc->backing_files_map);
-}
-
-static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
-{
- int id;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&fc->lock);
- /* FIXME: xarray might be space inefficient */
- id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
- spin_unlock(&fc->lock);
- idr_preload_end();
-
- WARN_ON_ONCE(id == 0);
- return id;
-}
-
-static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
- int id)
-{
- struct fuse_backing *fb;
-
- spin_lock(&fc->lock);
- fb = idr_remove(&fc->backing_files_map, id);
- spin_unlock(&fc->lock);
-
- return fb;
-}
-
-static int fuse_backing_id_free(int id, void *p, void *data)
-{
- struct fuse_backing *fb = p;
-
- WARN_ON_ONCE(refcount_read(&fb->count) != 1);
- fuse_backing_free(fb);
- return 0;
-}
-
-void fuse_backing_files_free(struct fuse_conn *fc)
-{
- idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
- idr_destroy(&fc->backing_files_map);
-}
-
-int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
-{
- struct file *file;
- struct super_block *backing_sb;
- struct fuse_backing *fb = NULL;
- int res;
-
- pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- res = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- res = -EINVAL;
- if (map->flags || map->padding)
- goto out;
-
- file = fget_raw(map->fd);
- res = -EBADF;
- if (!file)
- goto out;
-
- /* read/write/splice/mmap passthrough only relevant for regular files */
- res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
- if (!d_is_reg(file->f_path.dentry))
- goto out_fput;
-
- backing_sb = file_inode(file)->i_sb;
- res = -ELOOP;
- if (backing_sb->s_stack_depth >= fc->max_stack_depth)
- goto out_fput;
-
- fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
- res = -ENOMEM;
- if (!fb)
- goto out_fput;
-
- fb->file = file;
- fb->cred = prepare_creds();
- refcount_set(&fb->count, 1);
-
- res = fuse_backing_id_alloc(fc, fb);
- if (res < 0) {
- fuse_backing_free(fb);
- fb = NULL;
- }
-
-out:
- pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
-
- return res;
-
-out_fput:
- fput(file);
- goto out;
-}
-
-int fuse_backing_close(struct fuse_conn *fc, int backing_id)
-{
- struct fuse_backing *fb = NULL;
- int err;
-
- pr_debug("%s: backing_id=%d\n", __func__, backing_id);
-
- /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
- err = -EPERM;
- if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
- goto out;
-
- err = -EINVAL;
- if (backing_id <= 0)
- goto out;
-
- err = -ENOENT;
- fb = fuse_backing_id_remove(fc, backing_id);
- if (!fb)
- goto out;
-
- fuse_backing_put(fb);
- err = 0;
-out:
- pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
-
- return err;
-}
-
/*
* Setup passthrough to a backing file.
*
* Returns an fb object with elevated refcount to be stored in fuse inode.
*/
-struct fuse_backing *fuse_passthrough_open(struct file *file,
- struct inode *inode,
- int backing_id)
+struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
@@ -320,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file,
if (backing_id <= 0)
goto out;
- rcu_read_lock();
- fb = idr_find(&fc->backing_files_map, backing_id);
- fb = fuse_backing_get(fb);
- rcu_read_unlock();
-
err = -ENOENT;
+ fb = fuse_backing_lookup(fc, backing_id);
if (!fb)
goto out;
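The get/put helpers that moved out to backing.c depend on
refcount_inc_not_zero(): a lookup can race with the final put, so a reference
is taken only while the count is still non-zero, and the new
fuse_backing_lookup() wraps the idr_find()-plus-get sequence under RCU. The
core pattern, sketched with C11 atomics in userspace (free() standing in for
the RCU-deferred kfree):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
            atomic_int count;
            /* ... payload ... */
    };

    /* like refcount_inc_not_zero(): succeed only if the object is live */
    static struct obj *obj_get(struct obj *o)
    {
            int c;

            if (!o)
                    return NULL;
            c = atomic_load(&o->count);
            while (c != 0)
                    if (atomic_compare_exchange_weak(&o->count, &c, c + 1))
                            return o;
            return NULL;    /* raced with the final put */
    }

    static void obj_put(struct obj *o)
    {
            if (o && atomic_fetch_sub(&o->count, 1) == 1)
                    free(o);        /* kernel code defers this via kfree_rcu() */
    }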
diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c
new file mode 100644
index 000000000000..93bd72efc98c
--- /dev/null
+++ b/fs/fuse/trace.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "dev_uring_i.h"
+#include "fuse_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/pagemap.h>
+
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 76c8fd0bfc75..6bc7c97b017d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -20,6 +20,7 @@
#include <linux/cleanup.h>
#include <linux/uio.h>
#include "fuse_i.h"
+#include "fuse_dev_i.h"
/* Used to help calculate the FUSE connection's max_pages limit for a request's
* size. Parts of the struct fuse_req are sliced into scattergather lists in
@@ -761,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
- struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
@@ -790,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
}
}
- spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
- spin_unlock(&fpq->lock);
fuse_request_end(req);
spin_lock(&fsvq->lock);
@@ -1384,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
unsigned int out_sgs = 0;
unsigned int in_sgs = 0;
unsigned int total_sgs;
- unsigned int i;
+ unsigned int i, hash;
int ret;
bool notify;
struct fuse_pqueue *fpq;
@@ -1444,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
/* Request successfully sent. */
fpq = &fsvq->fud->pq;
+ hash = fuse_req_hash(req->in.h.unique);
spin_lock(&fpq->lock);
- list_add_tail(&req->list, fpq->processing);
+ list_add_tail(&req->list, &fpq->processing[hash]);
spin_unlock(&fpq->lock);
set_bit(FR_SENT, &req->flags);
/* matches barrier in request_wait_answer() */
@@ -1480,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
struct virtio_fs_vq *fsvq;
int ret;
- if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
- req->in.h.unique = fuse_get_unique(fiq);
+ fuse_request_assign_unique(fiq, req);
clear_bit(FR_PENDING, &req->flags);
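The enqueue change switches virtio_fs to the hashed processing table that the
regular /dev/fuse transport already uses: each in-flight request is filed on
one of several lists selected by hashing its unique id, so the reply path
scans a short bucket rather than one long list. A toy model of the layout
(bucket count, hash mixer and types are made up for illustration):

    #include <stddef.h>
    #include <stdint.h>

    #define PQ_HASH_BITS 8
    #define PQ_BUCKETS   (1u << PQ_HASH_BITS)

    struct req { uint64_t unique; struct req *next; };

    static struct req *processing[PQ_BUCKETS];

    static unsigned pq_hash(uint64_t unique)
    {
            /* any decent integer mixer works; this is Fibonacci hashing */
            return (unsigned)((unique * 0x9E3779B97F4A7C15ull)
                              >> (64 - PQ_HASH_BITS));
    }

    static void pq_add(struct req *r)
    {
            unsigned h = pq_hash(r->unique);

            r->next = processing[h];
            processing[h] = r;
    }

    static struct req *pq_find(uint64_t unique)
    {
            for (struct req *r = processing[pq_hash(unique)]; r; r = r->next)
                    if (r->unique == unique)
                            return r;
            return NULL;
    }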
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8760e7e20c9d..8a7ed80d9f2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1368,27 +1368,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags,
umode_t mode)
{
- struct dentry *d;
bool excl = !!(flags & O_EXCL);
- if (!d_in_lookup(dentry))
- goto skip_lookup;
-
- d = __gfs2_lookup(dir, dentry, file);
- if (IS_ERR(d))
- return PTR_ERR(d);
- if (d != NULL)
- dentry = d;
- if (d_really_is_positive(dentry)) {
- if (!(file->f_mode & FMODE_OPENED))
+ if (d_in_lookup(dentry)) {
+ struct dentry *d = __gfs2_lookup(dir, dentry, file);
+ if (file->f_mode & FMODE_OPENED) {
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ dput(d);
+ return excl && (flags & O_CREAT) ? -EEXIST : 0;
+ }
+ if (d || d_really_is_positive(dentry))
return finish_no_open(file, d);
- dput(d);
- return excl && (flags & O_CREAT) ? -EEXIST : 0;
}
-
- BUG_ON(d != NULL);
-
-skip_lookup:
if (!(flags & O_CREAT))
return -ENOENT;
diff --git a/fs/internal.h b/fs/internal.h
index a33d18ee5b74..9b2b4d116880 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -53,7 +53,7 @@ extern int finish_clean_context(struct fs_context *fc);
* namei.c
*/
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
- struct path *path, struct path *root);
+ struct path *path, const struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
@@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page);
-int path_umount(struct path *path, int flags);
+int path_umount(const struct path *path, int flags);
int show_path(struct seq_file *m, struct dentry *root);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 38861ca04899..2d0719bf6d87 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -131,7 +131,7 @@ __flush_batch(journal_t *journal, int *batch_count)
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
+ write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index fcedeb514e14..21f3d029da7d 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,9 +59,15 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
*/
inode->i_link[inode->i_size] = '\0';
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &jfs_file_inode_operations;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
unlock_new_inode(inode);
return inode;
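The jfs_iget() fix closes a corruption hole: the old else branch treated any
mode it did not recognize as a special file, so a mangled on-disk mode could
reach init_special_inode() with garbage. The patch whitelists the four special
types and fails the iget with -EIO otherwise. The shape of the check, as a
small userspace sketch using the standard mode macros:

    #include <sys/stat.h>
    #include <errno.h>

    /* 0 if the on-disk mode names a representable file type, -EIO if not */
    static int check_disk_mode(mode_t mode)
    {
            if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
                    return 0;       /* handled by the earlier branches */
            if (S_ISCHR(mode) || S_ISBLK(mode) ||
                S_ISFIFO(mode) || S_ISSOCK(mode))
                    return 0;       /* special inode */
            return -EIO;            /* corrupt: reject rather than guess */
    }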
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index ab11849cf9cc..0ab83bb7bbdf 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2903,7 +2903,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
stbl = DT_GETSTBL(p);
for (i = index; i < p->header.nextindex; i++) {
- if (stbl[i] < 0 || stbl[i] > 127) {
+ if (stbl[i] < 0 || stbl[i] >= DTPAGEMAXSLOT) {
jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld",
i, stbl[i], (long)ip->i_ino, (long long)bn);
free_page(dirent_buf);
@@ -3108,7 +3108,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
/* get the leftmost entry */
stbl = DT_GETSTBL(p);
- if (stbl[0] < 0 || stbl[0] > 127) {
+ if (stbl[0] < 0 || stbl[0] >= DTPAGEMAXSLOT) {
DT_PUTPAGE(mp);
jfs_error(ip->i_sb, "stbl[0] out of bound\n");
return -EIO;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 270808b6219b..b343c5ea1159 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1199,7 +1199,6 @@ static int open_dummy_log(struct super_block *sb)
init_waitqueue_head(&dummy_log->syncwait);
dummy_log->no_integrity = 1;
/* Make up some stuff */
- dummy_log->base = 0;
dummy_log->size = 1024;
rc = lmLogInit(dummy_log);
if (rc) {
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 98f9a432c336..52e6b58c5dbd 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -325,13 +325,13 @@ static int chkSuper(struct super_block *sb)
if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
cpu_to_le32(JFS_BAD_SAIT)) {
expected_AIM_bytesize = 2 * PSIZE;
- AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
+ AIM_bytesize = lengthPXD(&j_sb->s_aim2) * bsize;
expected_AIT_bytesize = 4 * PSIZE;
- AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
- AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
- AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
+ AIT_bytesize = lengthPXD(&j_sb->s_ait2) * bsize;
+ AIM_byte_addr = addressPXD(&j_sb->s_aim2) * bsize;
+ AIT_byte_addr = addressPXD(&j_sb->s_ait2) * bsize;
byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
- fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
+ fsckwsp_addr = addressPXD(&j_sb->s_fsckpxd) * bsize;
byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
if ((AIM_bytesize != expected_AIM_bytesize) ||
(AIT_bytesize != expected_AIT_bytesize) ||
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index be17e3c43582..7840a03e5bcb 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -272,14 +272,15 @@ int txInit(void)
if (TxBlock == NULL)
return -ENOMEM;
- for (k = 1; k < nTxBlock - 1; k++) {
- TxBlock[k].next = k + 1;
+ for (k = 0; k < nTxBlock; k++) {
init_waitqueue_head(&TxBlock[k].gcwait);
init_waitqueue_head(&TxBlock[k].waitor);
}
+
+ for (k = 1; k < nTxBlock - 1; k++) {
+ TxBlock[k].next = k + 1;
+ }
TxBlock[k].next = 0;
- init_waitqueue_head(&TxBlock[k].gcwait);
- init_waitqueue_head(&TxBlock[k].waitor);
TxAnchor.freetid = 1;
init_waitqueue_head(&TxAnchor.freewait);
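The txInit() rework fixes an initialization gap: the old single loop chained
slots 1..nTxBlock-2 and initialized their waitqueues, then terminated the list
and initialized the last slot by hand, leaving slot 0 with no initialized
waitqueues at all. Splitting the work into two passes makes the invariant
obvious: every slot gets its waitqueues, then the freelist is chained. The
array-backed freelist on its own (sizes and types simplified):

    #define NBLOCKS 16

    struct tblock {
            int next;       /* index of the next free slot; 0 terminates */
    };

    static struct tblock blocks[NBLOCKS];

    static void freelist_init(void)
    {
            int k;

            /* slot 0 is reserved, so the free list runs 1 .. NBLOCKS-1 */
            for (k = 1; k < NBLOCKS - 1; k++)
                    blocks[k].next = k + 1;
            blocks[k].next = 0;     /* k == NBLOCKS-1 ends the list */
    }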
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e80262a51884..d68afa196535 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -216,8 +216,7 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_xprt_destroy_all(serv, net);
- svc_rpcb_cleanup(serv, net);
+ svc_xprt_destroy_all(serv, net, true);
return err;
}
@@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_xprt_destroy_all(serv, net);
- svc_rpcb_cleanup(serv, net);
+ svc_xprt_destroy_all(serv, net, true);
}
} else {
pr_err("%s: no users! net=%x\n",
diff --git a/fs/mount.h b/fs/mount.h
index 79c85639a7ba..f13a28752d0b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
- struct list_head mnt_instance; /* mount instance on sb->s_mounts */
+ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */
+ struct mount * __aligned(1) *mnt_pprev_for_sb;
+ /* except that LSB of pprev is stolen */
+#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
@@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
extern seqlock_t mount_lock;
+DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock),
+ write_sequnlock(&mount_lock))
+DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock),
+ read_sequnlock_excl(&mount_lock))
+
struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
@@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m)
}
#endif
+static inline struct mount *topmost_overmount(struct mount *m)
+{
+ while (m->overmount)
+ m = m->overmount;
+ return m;
+}
+
+static inline bool __test_write_hold(struct mount * __aligned(1) *val)
+{
+ return (unsigned long)val & WRITE_HOLD;
+}
+
+static inline bool test_write_hold(const struct mount *m)
+{
+ return __test_write_hold(m->mnt_pprev_for_sb);
+}
+
+static inline void set_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ | WRITE_HOLD);
+}
+
+static inline void clear_write_hold(struct mount *m)
+{
+ m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb
+ & ~WRITE_HOLD);
+}
+
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
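The mount.h change trades the mnt_instance list_head for an open-coded hlist
whose pprev pointer doubles as flag storage: because the pointed-to field is
word-aligned, the low bit of the pointer is always zero and can be stolen for
WRITE_HOLD, which until now lived in mnt_flags as MNT_WRITE_HOLD. The
tagged-pointer trick in the abstract (hypothetical helpers):

    #include <stdbool.h>
    #include <stdint.h>

    #define TAG_BIT 1UL

    /* the pointee must be at least 2-byte aligned for bit 0 to be free */
    static inline void *tag_set(void *p)
    {
            return (void *)((uintptr_t)p | TAG_BIT);
    }

    static inline void *tag_clear(void *p)
    {
            return (void *)((uintptr_t)p & ~TAG_BIT);
    }

    static inline bool tag_test(const void *p)
    {
            return (uintptr_t)p & TAG_BIT;
    }

Any code that walks the list must strip the tag before dereferencing, which is
why the patch funnels all access through the {set,clear,test}_write_hold()
helpers.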
diff --git a/fs/namei.c b/fs/namei.c
index 507ca0d7878d..7377020a2cba 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2695,7 +2695,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
}
int filename_lookup(int dfd, struct filename *name, unsigned flags,
- struct path *path, struct path *root)
+ struct path *path, const struct path *root)
{
int retval;
struct nameidata nd;
@@ -3651,8 +3651,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
if (nd->flags & LOOKUP_DIRECTORY)
open_flag |= O_DIRECTORY;
- file->f_path.dentry = DENTRY_NOT_SET;
- file->f_path.mnt = nd->path.mnt;
+ file->__f_path.dentry = DENTRY_NOT_SET;
+ file->__f_path.mnt = nd->path.mnt;
error = dir->i_op->atomic_open(dir, dentry, file,
open_to_namei_flags(open_flag), mode);
d_lookup_done(dentry);
@@ -4020,8 +4020,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
child = d_alloc(parentpath->dentry, &slash_name);
if (unlikely(!child))
return -ENOMEM;
- file->f_path.mnt = parentpath->mnt;
- file->f_path.dentry = child;
+ file->__f_path.mnt = parentpath->mnt;
+ file->__f_path.dentry = child;
mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
error = dir->i_op->tmpfile(idmap, dir, file, mode);
dput(child);
@@ -4256,7 +4256,7 @@ struct dentry *start_creating_path(int dfd, const char *pathname,
}
EXPORT_SYMBOL(start_creating_path);
-void end_creating_path(struct path *path, struct dentry *dentry)
+void end_creating_path(const struct path *path, struct dentry *dentry)
{
if (!IS_ERR(dentry))
dput(dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index dc01b14c58cd..d82910f33dc4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
+static inline void namespace_lock(void);
+static void namespace_unlock(void);
+DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
+DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
+ up_read(&namespace_sem))
+
+DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))
+
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
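The two DEFINE_LOCK_GUARD_0() lines define scope-based lock classes:
guard(namespace_excl)() takes namespace_sem and releases it automatically when
the enclosing scope ends, which is what lets the error paths later in this
patch simply return. The underlying mechanism is the compiler's cleanup
attribute; a userspace approximation with a pthread mutex standing in for the
rwsem (names invented for the example):

    #include <pthread.h>

    static pthread_mutex_t ns_lock = PTHREAD_MUTEX_INITIALIZER;

    static void ns_guard_cleanup(int *unused)
    {
            (void)unused;
            pthread_mutex_unlock(&ns_lock);
    }

    /* rough equivalent of guard(namespace_excl)() */
    #define NS_GUARD() \
            int ns_guard __attribute__((cleanup(ns_guard_cleanup))) = \
                    (pthread_mutex_lock(&ns_lock), 0)

    static int example(void)
    {
            NS_GUARD();
            /* ... critical section: every return path unlocks ... */
            return 0;
    }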
@@ -363,7 +371,7 @@ out_free_cache:
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
-bool __mnt_is_readonly(struct vfsmount *mnt)
+bool __mnt_is_readonly(const struct vfsmount *mnt)
{
return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
@@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt)
#endif
}
-static int mnt_is_readonly(struct vfsmount *mnt)
+static int mnt_is_readonly(const struct vfsmount *mnt)
{
if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
return 1;
@@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m)
mnt_inc_writers(mnt);
/*
* The store to mnt_inc_writers must be visible before we pass
- * MNT_WRITE_HOLD loop below, so that the slowpath can see our
- * incremented count after it has set MNT_WRITE_HOLD.
+ * WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set WRITE_HOLD.
*/
smp_mb();
might_lock(&mount_lock.lock);
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
cpu_relax();
} else {
/*
* This prevents priority inversion, if the task
- * setting MNT_WRITE_HOLD got preempted on a remote
+ * setting WRITE_HOLD got preempted on a remote
* CPU, and it prevents life lock if the task setting
- * MNT_WRITE_HOLD has a lower priority and is bound to
+ * WRITE_HOLD has a lower priority and is bound to
* the same CPU as the task that is spinning here.
*/
preempt_enable();
- lock_mount_hash();
- unlock_mount_hash();
+ read_seqlock_excl(&mount_lock);
+ read_sequnlock_excl(&mount_lock);
preempt_disable();
}
}
/*
 * The barrier pairs with the barrier in sb_start_ro_state_change() making
- * sure that if we see MNT_WRITE_HOLD cleared, we will also see
+ * sure that if we see WRITE_HOLD cleared, we will also see
* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
* mnt_is_readonly() and bail in case we are racing with remount
* read-only.
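The comment above compresses a genuinely subtle protocol. The writer publishes
its counter increment before testing WRITE_HOLD; the holder publishes
WRITE_HOLD before summing the counters; the full barriers on both sides
guarantee that at least one of them observes the other. Schematically, with
C11 fences standing in for smp_mb() and a single global counter instead of the
kernel's per-CPU one (a sketch of the ordering only, not the real fast path,
which also rechecks read-only state and can back out):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int writers;
    static atomic_bool write_hold;

    static void writer_enter(void)
    {
            atomic_fetch_add_explicit(&writers, 1, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
            while (atomic_load_explicit(&write_hold, memory_order_relaxed))
                    ;       /* spin until the hold is dropped */
    }

    static bool holder_block_writers(void)
    {
            atomic_store_explicit(&write_hold, true, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
            /* any writer that got past the flag test is visible by now */
            return atomic_load_explicit(&writers, memory_order_relaxed) == 0;
    }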
@@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file);
* a call to mnt_unhold_writers() in order to stop preventing write access to
* @mnt.
*
- * Context: This function expects lock_mount_hash() to be held serializing
- * setting MNT_WRITE_HOLD.
+ * Context: This function expects to be in mount_locked_reader scope serializing
+ * setting WRITE_HOLD.
* Return: On success 0 is returned.
* On error, -EBUSY is returned.
*/
static inline int mnt_hold_writers(struct mount *mnt)
{
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
+ set_write_hold(mnt);
/*
- * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * After storing WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
*/
smp_mb();
@@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt)
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
- * MNT_WRITE_HOLD protects against this scenario, because
+ * WRITE_HOLD protects against this scenario, because
* mnt_want_write first increments count, then smp_mb, then spins on
- * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+ * WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
*/
if (mnt_get_writers(mnt) > 0)
@@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt)
* Stop preventing write access to @mnt allowing callers to gain write access
* to @mnt again.
*
- * This function can only be called after a successful call to
- * mnt_hold_writers().
+ * This function can only be called after a call to mnt_hold_writers().
*
- * Context: This function expects lock_mount_hash() to be held.
+ * Context: This function expects to be in the same mount_locked_reader scope
+ * as the matching mnt_hold_writers().
*/
static inline void mnt_unhold_writers(struct mount *mnt)
{
+ if (!test_write_hold(mnt))
+ return;
/*
- * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ clear_write_hold(mnt);
+}
+
+static inline void mnt_del_instance(struct mount *m)
+{
+ struct mount **p = m->mnt_pprev_for_sb;
+ struct mount *next = m->mnt_next_for_sb;
+
+ if (next)
+ next->mnt_pprev_for_sb = p;
+ *p = next;
+}
+
+static inline void mnt_add_instance(struct mount *m, struct super_block *s)
+{
+ struct mount *first = s->s_mounts;
+
+ if (first)
+ first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
+ m->mnt_next_for_sb = first;
+ m->mnt_pprev_for_sb = &s->s_mounts;
+ s->s_mounts = m;
}
static int mnt_make_readonly(struct mount *mnt)
@@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt)
int sb_prepare_remount_readonly(struct super_block *sb)
{
- struct mount *mnt;
int err = 0;
- /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
+ /* Racy optimization. Recheck the counter under WRITE_HOLD */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
- lock_mount_hash();
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- err = mnt_hold_writers(mnt);
+ guard(mount_locked_reader)();
+
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (!(m->mnt.mnt_flags & MNT_READONLY)) {
+ err = mnt_hold_writers(m);
if (err)
break;
}
@@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (!err)
sb_start_ro_state_change(sb);
- list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
+ if (test_write_hold(m))
+ clear_write_hold(m);
}
- unlock_mount_hash();
return err;
}
@@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
}
/**
- * __lookup_mnt - find first child mount
+ * __lookup_mnt - mount hash lookup
* @mnt: parent mount
- * @dentry: mountpoint
- *
- * If @mnt has a child mount @c mounted @dentry find and return it.
+ * @dentry: dentry of mountpoint
*
- * Note that the child mount @c need not be unique. There are cases
- * where shadow mounts are created. For example, during mount
- * propagation when a source mount @mnt whose root got overmounted by a
- * mount @o after path lookup but before @namespace_sem could be
- * acquired gets copied and propagated. So @mnt gets copied including
- * @o. When @mnt is propagated to a destination mount @d that already
- * has another mount @n mounted at the same mountpoint then the source
- * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
- * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
- * on @dentry.
+ * If @mnt has a child mount @c mounted on @dentry find and return it.
+ * Caller must either hold the spinlock component of @mount_lock or
+ * hold rcu_read_lock(), sample the seqcount component before the call
+ * and recheck it afterwards.
*
- * Return: The first child of @mnt mounted @dentry or NULL.
+ * Return: The child of @mnt mounted on @dentry or %NULL.
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
return NULL;
}
-/*
- * lookup_mnt - Return the first child mount mounted at path
- *
- * "First" means first mounted chronologically. If you create the
- * following mounts:
- *
- * mount /dev/sda1 /mnt
- * mount /dev/sda2 /mnt
- * mount /dev/sda3 /mnt
- *
- * Then lookup_mnt() on the base /mnt dentry in the root mount will
- * return successively the root dentry and vfsmount of /dev/sda1, then
- * /dev/sda2, then /dev/sda3, then NULL.
+/**
+ * lookup_mnt - Return the child mount mounted at given location
+ * @path: location in the namespace
*
- * lookup_mnt takes a reference to the found vfsmount.
+ * Acquires and returns a new reference to mount at given location
+ * or %NULL if nothing is mounted there.
*/
struct vfsmount *lookup_mnt(const struct path *path)
{
@@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt, *n;
- bool is_covered = false;
- down_read(&namespace_sem);
- rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
- is_covered = (mnt->mnt_mountpoint == dentry);
- if (is_covered)
- break;
- }
- up_read(&namespace_sem);
+ guard(namespace_shared)();
- return is_covered;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
+ if (mnt->mnt_mountpoint == dentry)
+ return true;
+
+ return false;
}
struct pinned_mountpoint {
struct hlist_node node;
struct mountpoint *mp;
+ struct mount *parent;
};
static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
@@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m)
}
}
-static inline int check_mnt(struct mount *mnt)
+static inline int check_mnt(const struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
@@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt)
touch_mnt_namespace(n);
}
+static void setup_mnt(struct mount *m, struct dentry *root)
+{
+ struct super_block *s = root->d_sb;
+
+ atomic_inc(&s->s_active);
+ m->mnt.mnt_sb = s;
+ m->mnt.mnt_root = dget(root);
+ m->mnt_mountpoint = m->mnt.mnt_root;
+ m->mnt_parent = m;
+
+ guard(mount_locked_reader)();
+ mnt_add_instance(m, s);
+}
+
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
@@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- atomic_inc(&fc->root->d_sb->s_active);
- mnt->mnt.mnt_sb = fc->root->d_sb;
- mnt->mnt.mnt_root = dget(fc->root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
+ setup_mnt(mnt, fc->root);
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
- unlock_mount_hash();
return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);
@@ -1221,8 +1239,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
return ERR_CAST(fc);
if (name)
- ret = vfs_parse_fs_string(fc, "source",
- name, strlen(name));
+ ret = vfs_parse_fs_string(fc, "source", name);
if (!ret)
ret = parse_monolithic_mount_data(fc, data);
if (!ret)
@@ -1238,7 +1255,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount);
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
- struct super_block *sb = old->mnt.mnt_sb;
struct mount *mnt;
int err;
@@ -1263,16 +1279,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (mnt->mnt_group_id)
set_mnt_shared(mnt);
- atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
- mnt->mnt.mnt_sb = sb;
- mnt->mnt.mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
- lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
- unlock_mount_hash();
+ setup_mnt(mnt, root);
if (flag & CL_PRIVATE) // we are done with it
return mnt;
@@ -1378,7 +1387,7 @@ static void mntput_no_expire(struct mount *mnt)
mnt->mnt.mnt_flags |= MNT_DOOMED;
rcu_read_unlock();
- list_del(&mnt->mnt_instance);
+ mnt_del_instance(mnt);
if (unlikely(!list_empty(&mnt->mnt_expire)))
list_del(&mnt->mnt_expire);
@@ -1719,8 +1728,6 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
-DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
-
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -1785,6 +1792,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
+ bulk_make_private(&tmp_list);
+
while (!list_empty(&tmp_list)) {
struct mnt_namespace *ns;
bool disconnect;
@@ -1809,7 +1818,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
umount_mnt(p);
}
}
- change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
@@ -1969,10 +1977,11 @@ void __detach_mounts(struct dentry *dentry)
struct pinned_mountpoint mp = {};
struct mount *mnt;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
+
if (!lookup_mountpoint(dentry, &mp))
- goto out_unlock;
+ return;
event++;
while (mp.node.next) {
@@ -1984,9 +1993,6 @@ void __detach_mounts(struct dentry *dentry)
else umount_tree(mnt, UMOUNT_CONNECTED);
}
unpin_mountpoint(&mp);
-out_unlock:
- unlock_mount_hash();
- namespace_unlock();
}
/*
@@ -2025,7 +2031,7 @@ static int can_umount(const struct path *path, int flags)
}
// caller is responsible for flags being sane
-int path_umount(struct path *path, int flags)
+int path_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
int ret;
@@ -2238,7 +2244,7 @@ static inline bool extend_array(struct path **res, struct path **to_free,
return p;
}
-struct path *collect_paths(const struct path *path,
+const struct path *collect_paths(const struct path *path,
struct path *prealloc, unsigned count)
{
struct mount *root = real_mount(path->mnt);
@@ -2246,7 +2252,7 @@ struct path *collect_paths(const struct path *path,
struct path *res = prealloc, *to_free = NULL;
unsigned n = 0;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (!check_mnt(root))
return ERR_PTR(-EINVAL);
@@ -2272,9 +2278,9 @@ struct path *collect_paths(const struct path *path,
return res;
}
-void drop_collected_paths(struct path *paths, struct path *prealloc)
+void drop_collected_paths(const struct path *paths, const struct path *prealloc)
{
- for (struct path *p = paths; p->mnt; p++)
+ for (const struct path *p = paths; p->mnt; p++)
path_put(p);
if (paths != prealloc)
kfree(paths);
@@ -2301,7 +2307,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
return;
}
- scoped_guard(namespace_lock, &namespace_sem) {
+ scoped_guard(namespace_excl) {
if (!anon_ns_root(m))
return;
@@ -2312,6 +2318,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
}
}
+/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */
static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2328,12 +2335,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
- bool res;
-
- read_seqlock_excl(&mount_lock);
- res = __has_locked_children(mnt, dentry);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return __has_locked_children(mnt, dentry);
}
/*
@@ -2341,21 +2344,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* specified subtree. Such references can act as pins for mount namespaces
* that aren't checked by the mount-cycle checking code, thereby allowing
* cycles to be made.
+ *
+ * locks: mount_locked_reader || namespace_shared && pinned(subtree)
*/
static bool check_for_nsfs_mounts(struct mount *subtree)
{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
+ for (struct mount *p = subtree; p; p = next_mnt(p, subtree))
if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
+ return false;
+ return true;
}
/**
@@ -2375,7 +2372,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- guard(rwsem_read)(&namespace_sem);
+ guard(namespace_shared)();
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
@@ -2496,8 +2493,7 @@ enum mnt_tree_flags_t {
/**
* attach_recursive_mnt - attach a source mount tree
* @source_mnt: mount tree to be attached
- * @dest_mnt: mount that @source_mnt will be mounted on
- * @dest_mp: the mountpoint @source_mnt will be mounted at
+ * @dest: the context for mounting at the place where the tree should go
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2560,10 +2556,11 @@ enum mnt_tree_flags_t {
* Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *dest_mnt,
- struct mountpoint *dest_mp)
+ const struct pinned_mountpoint *dest)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct mount *dest_mnt = dest->parent;
+ struct mountpoint *dest_mp = dest->mp;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct pinned_mountpoint root = {};
@@ -2643,10 +2640,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
commit_tree(child);
if (q) {
+ struct mount *r = topmost_overmount(child);
struct mountpoint *mp = root.mp;
- struct mount *r = child;
- while (unlikely(r->overmount))
- r = r->overmount;
+
if (unlikely(shorter) && child != source_mnt)
mp = shorter;
mnt_change_mountpoint(r, mp, q);
@@ -2675,110 +2671,120 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return err;
}
+static inline struct mount *where_to_mount(const struct path *path,
+ struct dentry **dentry,
+ bool beneath)
+{
+ struct mount *m;
+
+ if (unlikely(beneath)) {
+ m = topmost_overmount(real_mount(path->mnt));
+ *dentry = m->mnt_mountpoint;
+ return m->mnt_parent;
+ }
+ m = __lookup_mnt(path->mnt, path->dentry);
+ if (unlikely(m)) {
+ m = topmost_overmount(m);
+ *dentry = m->mnt.mnt_root;
+ return m;
+ }
+ *dentry = path->dentry;
+ return real_mount(path->mnt);
+}
+
/**
- * do_lock_mount - lock mount and mountpoint
- * @path: target path
- * @beneath: whether the intention is to mount beneath @path
- *
- * Follow the mount stack on @path until the top mount @mnt is found. If
- * the initial @path->{mnt,dentry} is a mountpoint lookup the first
- * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
- * until nothing is stacked on top of it anymore.
+ * do_lock_mount - acquire environment for mounting
+ * @path: target path
+ * @res: context to set up
+ * @beneath: whether the intention is to mount beneath @path
*
- * Acquire the inode_lock() on the top mount's ->mnt_root to protect
- * against concurrent removal of the new mountpoint from another mount
- * namespace.
+ * To mount something at given location, we need
+ * namespace_sem locked exclusive
+ * inode of dentry we are mounting on locked exclusive
+ * struct mountpoint for that dentry
+ * struct mount we are mounting on
*
- * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
- * @mp on @mnt->mnt_parent must be acquired. This protects against a
- * concurrent unlink of @mp->mnt_dentry from another mount namespace
- * where @mnt doesn't have a child mount mounted @mp. A concurrent
- * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
- * on top of it for @beneath.
+ * Results are stored in caller-supplied context (pinned_mountpoint);
+ * on success we have res->parent and res->mp pointing to parent and
+ * mountpoint respectively and res->node inserted into the ->m_list
+ * of the mountpoint, making sure the mountpoint won't disappear.
+ * On failure we have res->parent set to ERR_PTR(-E...), res->mp
+ * left NULL, res->node - empty.
+ * In case of success do_lock_mount returns with locks acquired (in
+ * proper order - inode lock nests outside of namespace_sem).
*
- * In addition, @beneath needs to make sure that @mnt hasn't been
- * unmounted or moved from its current mountpoint in between dropping
- * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
- * being unmounted would be detected later by e.g., calling
- * check_mnt(mnt) in the function it's called from. For the @beneath
- * case however, it's useful to detect it directly in do_lock_mount().
- * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
- * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
- * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
+ * A request to mount on an overmounted location is treated as "mount
+ * on top of whatever's overmounting it"; a request to mount beneath a
+ * location as "mount immediately beneath the topmost mount at that
+ * place".
*
- * Return: Either the target mountpoint on the top mount or the top
- * mount's mountpoint.
+ * In all cases the location must not have been unmounted and the
+ * chosen mountpoint must be allowed to be mounted on. For "beneath"
+ * case we also require the location to be at the root of a mount
+ * that has a parent (i.e. is not a root of some namespace).
*/
-static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
+static void do_lock_mount(const struct path *path,
+ struct pinned_mountpoint *res,
+ bool beneath)
{
- struct vfsmount *mnt = path->mnt;
- struct dentry *dentry;
- struct path under = {};
- int err = -ENOENT;
+ int err;
- for (;;) {
- struct mount *m = real_mount(mnt);
+ if (unlikely(beneath) && !path_mounted(path)) {
+ res->parent = ERR_PTR(-EINVAL);
+ return;
+ }
- if (beneath) {
- path_put(&under);
- read_seqlock_excl(&mount_lock);
- under.mnt = mntget(&m->mnt_parent->mnt);
- under.dentry = dget(m->mnt_mountpoint);
- read_sequnlock_excl(&mount_lock);
- dentry = under.dentry;
- } else {
- dentry = path->dentry;
+ do {
+ struct dentry *dentry, *d;
+ struct mount *m, *n;
+
+ scoped_guard(mount_locked_reader) {
+ m = where_to_mount(path, &dentry, beneath);
+ if (&m->mnt != path->mnt) {
+ mntget(&m->mnt);
+ dget(dentry);
+ }
}
inode_lock(dentry->d_inode);
namespace_lock();
- if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
- break; // not to be mounted on
+ // check if the chain of mounts (if any) has changed.
+ scoped_guard(mount_locked_reader)
+ n = where_to_mount(path, &d, beneath);
- if (beneath && unlikely(m->mnt_mountpoint != dentry ||
- &m->mnt_parent->mnt != under.mnt)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- continue; // got moved
- }
+ if (unlikely(n != m || dentry != d))
+ err = -EAGAIN; // something moved, retry
+ else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt)))
+ err = -ENOENT; // not to be mounted on
+ else if (beneath && &m->mnt == path->mnt && !m->overmount)
+ err = -EINVAL;
+ else
+ err = get_mountpoint(dentry, res);
- mnt = lookup_mnt(path);
- if (unlikely(mnt)) {
+ if (unlikely(err)) {
+ res->parent = ERR_PTR(err);
namespace_unlock();
inode_unlock(dentry->d_inode);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- continue; // got overmounted
+ } else {
+ res->parent = m;
}
- err = get_mountpoint(dentry, pinned);
- if (err)
- break;
- if (beneath) {
- /*
- * @under duplicates the references that will stay
- * at least until namespace_unlock(), so the path_put()
- * below is safe (and OK to do under namespace_lock -
- * we are not dropping the final references here).
- */
- path_put(&under);
+ /*
+ * Drop the temporary references. This is subtle - on success
+ * we are doing that under namespace_sem, which would normally
+ * be forbidden. However, in that case we are guaranteed that
+ * refcounts won't reach zero, since we know that path->mnt
+ * is mounted and thus all mounts reachable from it are pinned
+ * and stable, along with their mountpoints and roots.
+ */
+ if (&m->mnt != path->mnt) {
+ dput(dentry);
+ mntput(&m->mnt);
}
- return 0;
- }
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- path_put(&under);
- return err;
-}
-
-static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
-{
- return do_lock_mount(path, m, false);
+ } while (err == -EAGAIN);
}
-static void unlock_mount(struct pinned_mountpoint *m)
+static void __unlock_mount(struct pinned_mountpoint *m)
{
inode_unlock(m->mp->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
@@ -2787,16 +2793,30 @@ static void unlock_mount(struct pinned_mountpoint *m)
namespace_unlock();
}
-static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
+static inline void unlock_mount(struct pinned_mountpoint *m)
+{
+ if (!IS_ERR(m->parent))
+ __unlock_mount(m);
+}
+
+#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ do_lock_mount((path), &mp, (beneath))
+#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
+#define LOCK_MOUNT_EXACT(mp, path) \
+ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+ lock_mount_exact((path), &mp)
+
+static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
return -EINVAL;
- if (d_is_dir(mp->m_dentry) !=
+ if (d_is_dir(mp->mp->m_dentry) !=
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp);
+ return attach_recursive_mnt(mnt, mp);
}
static int may_change_propagation(const struct mount *m)
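With do_lock_mount() now returning void, success and failure both travel
through the pinned_mountpoint itself: res->parent either points at the parent
mount or carries an ERR_PTR()-encoded errno, and the LOCK_MOUNT* macros bolt
on __cleanup(unlock_mount) so every exit from the scope unlocks. The ERR_PTR
convention is easy to model in userspace, assuming (as the kernel does) that
no valid object lives in the top page of the address space:

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long err)       { return (void *)err; }
    static inline long PTR_ERR(const void *p)   { return (long)p; }

    static inline bool IS_ERR(const void *p)
    {
            /* negative errnos occupy the highest MAX_ERRNO addresses */
            return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    /* usage in the style of the callers above:
     *     if (IS_ERR(mp.parent))
     *             return PTR_ERR(mp.parent);
     */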
@@ -2832,13 +2852,13 @@ static int flags_to_propagation_type(int ms_flags)
/*
* recursively change the type of the mountpoint.
*/
-static int do_change_type(struct path *path, int ms_flags)
+static int do_change_type(const struct path *path, int ms_flags)
{
struct mount *m;
struct mount *mnt = real_mount(path->mnt);
int recurse = ms_flags & MS_REC;
int type;
- int err = 0;
+ int err;
if (!path_mounted(path))
return -EINVAL;
@@ -2847,23 +2867,22 @@ static int do_change_type(struct path *path, int ms_flags)
if (!type)
return -EINVAL;
- namespace_lock();
+ guard(namespace_excl)();
+
err = may_change_propagation(mnt);
if (err)
- goto out_unlock;
+ return err;
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
- goto out_unlock;
+ return err;
}
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- out_unlock:
- namespace_unlock();
- return err;
+ return 0;
}
/* may_copy_tree() - check if a mount tree can be copied
@@ -2909,7 +2928,7 @@ static int do_change_type(struct path *path, int ms_flags)
*
* Returns true if the mount tree can be copied, false otherwise.
*/
-static inline bool may_copy_tree(struct path *path)
+static inline bool may_copy_tree(const struct path *path)
{
struct mount *mnt = real_mount(path->mnt);
const struct dentry_operations *d_op;
@@ -2931,7 +2950,7 @@ static inline bool may_copy_tree(struct path *path)
}
-static struct mount *__do_loopback(struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path, int recurse)
{
struct mount *old = real_mount(old_path->mnt);
@@ -2953,12 +2972,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
/*
* do loopback mount.
*/
-static int do_loopback(struct path *path, const char *old_name,
- int recurse)
+static int do_loopback(const struct path *path, const char *old_name,
+ int recurse)
{
- struct path old_path;
- struct mount *mnt = NULL, *parent;
- struct pinned_mountpoint mp = {};
+ struct path old_path __free(path_put) = {};
+ struct mount *mnt = NULL;
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -2966,49 +2984,40 @@ static int do_loopback(struct path *path, const char *old_name,
if (err)
return err;
- err = -EINVAL;
if (mnt_ns_loop(old_path.dentry))
- goto out;
+ return -EINVAL;
- err = lock_mount(path, &mp);
- if (err)
- goto out;
+ LOCK_MOUNT(mp, path);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
- parent = real_mount(path->mnt);
- if (!check_mnt(parent))
- goto out2;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
mnt = __do_loopback(&old_path, recurse);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- goto out2;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- err = graft_tree(mnt, parent, mp.mp);
+ err = graft_tree(mnt, &mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
-out2:
- unlock_mount(&mp);
-out:
- path_put(&old_path);
return err;
}
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
{
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
- struct file *file;
ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
- return ERR_CAST(ns);
+ return ns;
- namespace_lock();
+ guard(namespace_excl)();
/*
* Record the sequence number of the source mount namespace.
@@ -3025,23 +3034,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
- namespace_unlock();
- free_mnt_ns(ns);
+ emptied_ns = ns;
return ERR_CAST(mnt);
}
- lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
mnt_add_to_ns(ns, p);
ns->nr_mounts++;
}
ns->root = mnt;
- mntget(&mnt->mnt);
- unlock_mount_hash();
- namespace_unlock();
+ return ns;
+}
+
+static struct file *open_detached_copy(struct path *path, bool recursive)
+{
+ struct mnt_namespace *ns = get_detached_copy(path, recursive);
+ struct file *file;
+
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
mntput(path->mnt);
- path->mnt = &mnt->mnt;
+ path->mnt = mntget(&ns->root->mnt);
file = dentry_open(path, O_PATH, current_cred());
if (IS_ERR(file))
dissolve_on_fput(path->mnt);
@@ -3158,7 +3172,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
touch_mnt_namespace(mnt->mnt_ns);
}
-static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
+static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
+ struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
@@ -3192,7 +3207,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
* superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
* to mount(2).
*/
-static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
+static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
{
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
@@ -3229,7 +3244,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int sb_flags,
+static int do_remount(const struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3287,49 +3302,46 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-static int do_set_group(struct path *from_path, struct path *to_path)
+static int do_set_group(const struct path *from_path, const struct path *to_path)
{
- struct mount *from, *to;
+ struct mount *from = real_mount(from_path->mnt);
+ struct mount *to = real_mount(to_path->mnt);
int err;
- from = real_mount(from_path->mnt);
- to = real_mount(to_path->mnt);
-
- namespace_lock();
+ guard(namespace_excl)();
err = may_change_propagation(from);
if (err)
- goto out;
+ return err;
err = may_change_propagation(to);
if (err)
- goto out;
+ return err;
- err = -EINVAL;
/* To and From paths should be mount roots */
if (!path_mounted(from_path))
- goto out;
+ return -EINVAL;
if (!path_mounted(to_path))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed across same superblock */
if (from->mnt.mnt_sb != to->mnt.mnt_sb)
- goto out;
+ return -EINVAL;
/* From mount root should be wider than To mount root */
if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* From mount should not have locked children in place of To's root */
if (__has_locked_children(from, to->mnt.mnt_root))
- goto out;
+ return -EINVAL;
/* Setting sharing groups is only allowed on private mounts */
if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
- goto out;
+ return -EINVAL;
/* From should not be private */
if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
- goto out;
+ return -EINVAL;
if (IS_MNT_SLAVE(from)) {
hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
@@ -3341,11 +3353,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
list_add(&to->mnt_share, &from->mnt_share);
set_mnt_shared(to);
}
-
- err = 0;
-out:
- namespace_unlock();
- return err;
+ return 0;
}
/**
@@ -3389,17 +3397,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
/**
* can_move_mount_beneath - check that we can mount beneath the top mount
- * @from: mount to mount beneath
- * @to: mount under which to mount
- * @mp: mountpoint of @to
+ * @mnt_from: mount we are trying to move
+ * @mnt_to: mount under which to mount
+ * @mp: mountpoint of @mnt_to
*
- * - Make sure that @to->dentry is actually the root of a mount under
- * which we can mount another mount.
* - Make sure that nothing can be mounted beneath the caller's current
* root or the rootfs of the namespace.
* - Make sure that the caller can unmount the topmost mount ensuring
* that the caller could reveal the underlying mountpoint.
- * - Ensure that nothing has been mounted on top of @from before we
+ * - Ensure that nothing has been mounted on top of @mnt_from before we
* grabbed @namespace_sem to avoid creating pointless shadow mounts.
* - Prevent mounting beneath a mount if the propagation relationship
* between the source mount, parent mount, and top mount would lead to
@@ -3408,25 +3414,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
* Context: This function expects namespace_lock() to be held.
* Return: On success 0, and on error a negative error code is returned.
*/
-static int can_move_mount_beneath(const struct path *from,
- const struct path *to,
+static int can_move_mount_beneath(const struct mount *mnt_from,
+ const struct mount *mnt_to,
const struct mountpoint *mp)
{
- struct mount *mnt_from = real_mount(from->mnt),
- *mnt_to = real_mount(to->mnt),
- *parent_mnt_to = mnt_to->mnt_parent;
-
- if (!mnt_has_parent(mnt_to))
- return -EINVAL;
-
- if (!path_mounted(to))
- return -EINVAL;
+ struct mount *parent_mnt_to = mnt_to->mnt_parent;
if (IS_MNT_LOCKED(mnt_to))
return -EINVAL;
/* Avoid creating shadow mounts during mount propagation. */
- if (path_overmounted(from))
+ if (mnt_from->overmount)
return -EINVAL;
/*
@@ -3517,97 +3515,83 @@ static inline bool may_use_mount(struct mount *mnt)
return check_anonymous_mnt(mnt);
}
-static int do_move_mount(struct path *old_path,
- struct path *new_path, enum mnt_tree_flags_t flags)
+static int do_move_mount(const struct path *old_path,
+ const struct path *new_path,
+ enum mnt_tree_flags_t flags)
{
- struct mnt_namespace *ns;
- struct mount *p;
- struct mount *old;
- struct mount *parent;
- struct pinned_mountpoint mp;
+ struct mount *old = real_mount(old_path->mnt);
int err;
bool beneath = flags & MNT_TREE_BENEATH;
- err = do_lock_mount(new_path, &mp, beneath);
- if (err)
- return err;
+ if (!path_mounted(old_path))
+ return -EINVAL;
- old = real_mount(old_path->mnt);
- p = real_mount(new_path->mnt);
- parent = old->mnt_parent;
- ns = old->mnt_ns;
+ if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry))
+ return -EINVAL;
- err = -EINVAL;
+ LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
+ if (IS_ERR(mp.parent))
+ return PTR_ERR(mp.parent);
if (check_mnt(old)) {
/* if the source is in our namespace... */
/* ... it should be detachable from parent */
if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
- goto out;
+ return -EINVAL;
+ /* ... which should not be shared */
+ if (IS_MNT_SHARED(old->mnt_parent))
+ return -EINVAL;
/* ... and the target should be in our namespace */
- if (!check_mnt(p))
- goto out;
- /* parent of the source should not be shared */
- if (IS_MNT_SHARED(parent))
- goto out;
+ if (!check_mnt(mp.parent))
+ return -EINVAL;
} else {
/*
* otherwise the source must be the root of some anon namespace.
*/
if (!anon_ns_root(old))
- goto out;
+ return -EINVAL;
/*
* Bail out early if the target is within the same namespace -
* subsequent checks would've rejected that, but they'd miss
* some corner cases unless we check it early.
*/
- if (ns == p->mnt_ns)
- goto out;
+ if (old->mnt_ns == mp.parent->mnt_ns)
+ return -EINVAL;
/*
* Target should be either in our namespace or in an acceptable
* anon namespace, in the sense of check_anonymous_mnt().
*/
- if (!may_use_mount(p))
- goto out;
+ if (!may_use_mount(mp.parent))
+ return -EINVAL;
}
- if (!path_mounted(old_path))
- goto out;
-
- if (d_is_dir(new_path->dentry) !=
- d_is_dir(old_path->dentry))
- goto out;
-
if (beneath) {
- err = can_move_mount_beneath(old_path, new_path, mp.mp);
- if (err)
- goto out;
+ struct mount *over = real_mount(new_path->mnt);
- err = -EINVAL;
- p = p->mnt_parent;
+ if (mp.parent != over->mnt_parent)
+ over = mp.parent->overmount;
+ err = can_move_mount_beneath(old, over, mp.mp);
+ if (err)
+ return err;
}
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
- if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
- goto out;
- err = -ELOOP;
+ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old))
+ return -EINVAL;
if (!check_for_nsfs_mounts(old))
- goto out;
- if (mount_is_ancestor(old, p))
- goto out;
+ return -ELOOP;
+ if (mount_is_ancestor(old, mp.parent))
+ return -ELOOP;
- err = attach_recursive_mnt(old, p, mp.mp);
-out:
- unlock_mount(&mp);
- return err;
+ return attach_recursive_mnt(old, &mp);
}
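
A note on the new helpers: the LOCK_MOUNT*() macro bodies are outside this diff, but every caller in it follows the same convention, sketched here (an inference from the hunks, not the real definitions):

    LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
    if (IS_ERR(mp.parent))                  /* locking/lookup failed */
            return PTR_ERR(mp.parent);      /* errno travels ERR_PTR-encoded */
    /* on success: mp.parent is the mount to graft onto, mp.mp the pinned
     * mountpoint; cleanup drops both the pin and the locks at end of scope,
     * which is what makes all the early 'return -EINVAL;' conversions safe. */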
-static int do_move_mount_old(struct path *path, const char *old_name)
+static int do_move_mount_old(const struct path *path, const char *old_name)
{
- struct path old_path;
+ struct path old_path __free(path_put) = {};
int err;
if (!old_name || !*old_name)
@@ -3617,18 +3601,19 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, 0);
- path_put(&old_path);
- return err;
+ return do_move_mount(&old_path, path, 0);
}
/*
* add a mount into a namespace's mount tree
*/
-static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
- const struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp,
+ int mnt_flags)
{
- struct mount *parent = real_mount(path->mnt);
+ struct mount *parent = mp->parent;
+
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
mnt_flags &= ~MNT_INTERNAL_FLAGS;
@@ -3642,14 +3627,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
}
/* Refuse the same filesystem on the same mount point */
- if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
+ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
+ parent->mnt.mnt_root == mp->mp->m_dentry)
return -EBUSY;
if (d_is_symlink(newmnt->mnt.mnt_root))
return -EINVAL;
newmnt->mnt.mnt_flags = mnt_flags;
- return graft_tree(newmnt, parent, mp);
+ return graft_tree(newmnt, mp);
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
@@ -3658,41 +3644,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
* Create a new mount using a superblock configuration and request it
* be added to the namespace tree.
*/
-static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint,
unsigned int mnt_flags)
{
- struct vfsmount *mnt;
- struct pinned_mountpoint mp = {};
- struct super_block *sb = fc->root->d_sb;
+ struct super_block *sb;
+ struct vfsmount *mnt __free(mntput) = fc_mount(fc);
int error;
- error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags)) {
- errorfcp(fc, "VFS", "Mount too revealing");
- error = -EPERM;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
- if (unlikely(error)) {
- fc_drop_locked(fc);
+ sb = fc->root->d_sb;
+ error = security_sb_kern_mount(sb);
+ if (unlikely(error))
return error;
- }
- up_write(&sb->s_umount);
-
- mnt = vfs_create_mount(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
+ if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
+ errorfcp(fc, "VFS", "Mount too revealing");
+ return -EPERM;
+ }
mnt_warn_timestamp_expiry(mountpoint, mnt);
- error = lock_mount(mountpoint, &mp);
- if (!error) {
- error = do_add_mount(real_mount(mnt), mp.mp,
- mountpoint, mnt_flags);
- unlock_mount(&mp);
- }
- if (error < 0)
- mntput(mnt);
+ LOCK_MOUNT(mp, mountpoint);
+ error = do_add_mount(real_mount(mnt), &mp, mnt_flags);
+ if (!error)
+ retain_and_null_ptr(mnt); // consumed on success
return error;
}
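
The ownership pattern above in isolation — a sketch assuming the stock cleanup helpers (DEFINE_FREE(mntput, ...) in include/linux/mount.h tolerates NULL and ERR_PTR values; do_stuff() is a stand-in):

    struct vfsmount *mnt __free(mntput) = fc_mount(fc);

    if (IS_ERR(mnt))
            return PTR_ERR(mnt);            /* cleanup skips ERR_PTR values */
    error = do_stuff(mnt);
    if (!error)
            retain_and_null_ptr(mnt);       /* success: ownership moved, disarm */
    return error;                           /* failure: mntput() fires here */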
@@ -3700,8 +3677,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
-static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
- int mnt_flags, const char *name, void *data)
+static int do_new_mount(const struct path *path, const char *fstype,
+ int sb_flags, int mnt_flags,
+ const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
@@ -3738,27 +3716,46 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
fc->oldapi = true;
if (subtype)
- err = vfs_parse_fs_string(fc, "subtype",
- subtype, strlen(subtype));
+ err = vfs_parse_fs_string(fc, "subtype", subtype);
if (!err && name)
- err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+ err = vfs_parse_fs_string(fc, "source", name);
if (!err)
err = parse_monolithic_mount_data(fc, data);
if (!err && !mount_capable(fc))
err = -EPERM;
if (!err)
- err = vfs_get_tree(fc);
- if (!err)
err = do_new_mount_fc(fc, path, mnt_flags);
put_fs_context(fc);
return err;
}
-int finish_automount(struct vfsmount *m, const struct path *path)
+static void lock_mount_exact(const struct path *path,
+ struct pinned_mountpoint *mp)
{
struct dentry *dentry = path->dentry;
- struct pinned_mountpoint mp = {};
+ int err;
+
+ inode_lock(dentry->d_inode);
+ namespace_lock();
+ if (unlikely(cant_mount(dentry)))
+ err = -ENOENT;
+ else if (path_overmounted(path))
+ err = -EBUSY;
+ else
+ err = get_mountpoint(dentry, mp);
+ if (unlikely(err)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ mp->parent = ERR_PTR(err);
+ } else {
+ mp->parent = real_mount(path->mnt);
+ }
+}
+
+int finish_automount(struct vfsmount *__m, const struct path *path)
+{
+ struct vfsmount *m __free(mntput) = __m;
struct mount *mnt;
int err;
@@ -3769,43 +3766,21 @@ int finish_automount(struct vfsmount *m, const struct path *path)
mnt = real_mount(m);
- if (m->mnt_sb == path->mnt->mnt_sb &&
- m->mnt_root == dentry) {
- err = -ELOOP;
- goto discard;
- }
+ if (m->mnt_root == path->dentry)
+ return -ELOOP;
/*
- * we don't want to use lock_mount() - in this case finding something
+ * we don't want to use LOCK_MOUNT() - in this case finding something
* that overmounts our mountpoint means "quietly drop what we've
* got", not "try to mount it on top".
*/
- inode_lock(dentry->d_inode);
- namespace_lock();
- if (unlikely(cant_mount(dentry))) {
- err = -ENOENT;
- goto discard_locked;
- }
- if (path_overmounted(path)) {
- err = 0;
- goto discard_locked;
- }
- err = get_mountpoint(dentry, &mp);
- if (err)
- goto discard_locked;
-
- err = do_add_mount(mnt, mp.mp, path,
- path->mnt->mnt_flags | MNT_SHRINKABLE);
- unlock_mount(&mp);
- if (unlikely(err))
- goto discard;
- return 0;
+ LOCK_MOUNT_EXACT(mp, path);
+ if (mp.parent == ERR_PTR(-EBUSY))
+ return 0;
-discard_locked:
- namespace_unlock();
- inode_unlock(dentry->d_inode);
-discard:
- mntput(m);
+ err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ if (likely(!err))
+ retain_and_null_ptr(m);
return err;
}
@@ -3816,9 +3791,8 @@ discard:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- read_seqlock_excl(&mount_lock);
+ guard(mount_locked_reader)();
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
- read_sequnlock_excl(&mount_lock);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -3835,8 +3809,8 @@ void mark_mounts_for_expiry(struct list_head *mounts)
if (list_empty(mounts))
return;
- namespace_lock();
- lock_mount_hash();
+ guard(namespace_excl)();
+ guard(mount_writer)();
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -3858,8 +3832,6 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
- unlock_mount_hash();
- namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
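
The guard()/scoped_guard() names used from here on (namespace_excl, namespace_shared, mount_writer, mount_locked_reader) are lock-guard classes defined elsewhere in this series, plausibly along these lines given the lock calls they replace and the include/linux/cleanup.h conventions (a sketch, not the actual definitions):

    DEFINE_LOCK_GUARD_0(namespace_excl,
                        down_write(&namespace_sem), up_write(&namespace_sem))
    DEFINE_LOCK_GUARD_0(namespace_shared,
                        down_read(&namespace_sem), up_read(&namespace_sem))
    DEFINE_LOCK_GUARD_0(mount_writer,
                        lock_mount_hash(), unlock_mount_hash())
    DEFINE_LOCK_GUARD_0(mount_locked_reader,
                        read_seqlock_excl(&mount_lock),
                        read_sequnlock_excl(&mount_lock))

    /* usage: take on declaration, release automatically at end of scope */
    guard(namespace_excl)();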
@@ -3987,7 +3959,7 @@ static char *copy_mount_string(const void __user *data)
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
-int path_mount(const char *dev_name, struct path *path,
+int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page)
{
unsigned int mnt_flags = 0, sb_flags;
@@ -4069,15 +4041,13 @@ int path_mount(const char *dev_name, struct path *path,
int do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
if (ret)
return ret;
- ret = path_mount(dev_name, &path, type_page, flags, data_page);
- path_put(&path);
- return ret;
+ return path_mount(dev_name, &path, type_page, flags, data_page);
}
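
The __free(path_put) conversions in this file all lean on two facts: DEFINE_FREE(path_put, struct path, path_put(&_T)) already exists in include/linux/path.h, and path_put() on a zeroed struct path is a no-op, so the `= {}` initializer covers every early return. Minimal shape (use() is a stand-in):

    struct path path __free(path_put) = {};
    int err = user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, &path);

    if (err)
            return err;     /* path still zeroed; cleanup is a no-op */
    return use(&path);      /* path dropped automatically after use() returns */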
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
@@ -4138,7 +4108,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
+ struct vfsmount *rootmnt __free(mntput) = NULL;
+ struct vfsmount *pwdmnt __free(mntput) = NULL;
struct mount *p, *q;
struct mount *old;
struct mount *new;
@@ -4157,23 +4128,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
if (IS_ERR(new_ns))
return new_ns;
- namespace_lock();
+ guard(namespace_excl)();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
- namespace_unlock();
- ns_common_free(ns);
- dec_mnt_namespaces(new_ns->ucounts);
- mnt_ns_release(new_ns);
+ emptied_ns = new_ns;
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
- lock_mount_hash();
+ guard(mount_writer)();
lock_mnt_tree(new);
- unlock_mount_hash();
}
new_ns->root = new;
@@ -4205,13 +4172,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
- namespace_unlock();
-
- if (rootmnt)
- mntput(rootmnt);
- if (pwdmnt)
- mntput(pwdmnt);
-
ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4436,7 +4396,8 @@ err_unlock:
return ret;
}
-static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+static inline int vfs_move_mount(const struct path *from_path,
+ const struct path *to_path,
enum mnt_tree_flags_t mflags)
{
int ret;
@@ -4542,7 +4503,7 @@ SYSCALL_DEFINE5(move_mount,
/*
* Return true if path is reachable from root
*
- * namespace_sem or mount_lock is held
+ * locks: mount_locked_reader || namespace_shared && is_mounted(mnt)
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
@@ -4556,11 +4517,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
bool path_is_under(const struct path *path1, const struct path *path2)
{
- bool res;
- read_seqlock_excl(&mount_lock);
- res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- read_sequnlock_excl(&mount_lock);
- return res;
+ guard(mount_locked_reader)();
+ return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
}
EXPORT_SYMBOL(path_is_under);
@@ -4592,9 +4550,10 @@ EXPORT_SYMBOL(path_is_under);
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
- struct path new, old, root;
+ struct path new __free(path_put) = {};
+ struct path old __free(path_put) = {};
+ struct path root __free(path_put) = {};
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
- struct pinned_mountpoint old_mp = {};
int error;
if (!may_mount())
@@ -4603,57 +4562,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = user_path_at(AT_FDCWD, new_root,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
- goto out0;
+ return error;
error = user_path_at(AT_FDCWD, put_old,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
- goto out1;
+ return error;
error = security_sb_pivotroot(&old, &new);
if (error)
- goto out2;
+ return error;
get_fs_root(current->fs, &root);
- error = lock_mount(&old, &old_mp);
- if (error)
- goto out3;
- error = -EINVAL;
+ LOCK_MOUNT(old_mp, &old);
+ old_mnt = old_mp.parent;
+ if (IS_ERR(old_mnt))
+ return PTR_ERR(old_mnt);
+
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
- old_mnt = real_mount(old.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
IS_MNT_SHARED(ex_parent) ||
IS_MNT_SHARED(root_parent))
- goto out4;
+ return -EINVAL;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
- goto out4;
+ return -EINVAL;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
- goto out4;
- error = -ENOENT;
+ return -EINVAL;
if (d_unlinked(new.dentry))
- goto out4;
- error = -EBUSY;
+ return -ENOENT;
if (new_mnt == root_mnt || old_mnt == root_mnt)
- goto out4; /* loop, on the same file system */
- error = -EINVAL;
+ return -EBUSY; /* loop, on the same file system */
if (!path_mounted(&root))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
if (!path_mounted(&new))
- goto out4; /* not a mountpoint */
+ return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
- goto out4; /* absolute root */
+ return -EINVAL; /* absolute root */
/* make sure we can reach put_old from new_root */
- if (!is_path_reachable(old_mnt, old.dentry, &new))
- goto out4;
+ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+ return -EINVAL;
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
- goto out4;
+ return -EINVAL;
lock_mount_hash();
umount_mnt(new_mnt);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -4672,17 +4628,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
- error = 0;
-out4:
- unlock_mount(&old_mp);
-out3:
- path_put(&root);
-out2:
- path_put(&old);
-out1:
- path_put(&new);
-out0:
- return error;
+ return 0;
}
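
As a refresher on what this syscall serves (and why new_mnt == root_mnt is -EBUSY), the canonical container-style invocation stacks put_old on top of the new root and then detaches it, per pivot_root(2). Illustrative userspace, no glibc wrapper assumed:

    #include <err.h>
    #include <sys/mount.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void enter_new_root(void)
    {
            if (chdir("/newroot"))
                    err(1, "chdir");
            if (syscall(SYS_pivot_root, ".", "."))  /* put_old atop new root */
                    err(1, "pivot_root");
            if (umount2(".", MNT_DETACH))           /* peel the old root off */
                    err(1, "umount2");
    }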
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
@@ -4772,8 +4718,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
if (!mnt_allow_writers(kattr, m)) {
err = mnt_hold_writers(m);
- if (err)
+ if (err) {
+ m = next_mnt(m, mnt);
break;
+ }
}
if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
@@ -4781,25 +4729,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
}
if (err) {
- struct mount *p;
-
- /*
- * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
- * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
- * mounts and needs to take care to include the first mount.
- */
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- /* If we had to hold writers unblock them. */
- if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(p);
-
- /*
- * We're done once the first mount we changed got
- * MNT_WRITE_HOLD unset.
- */
- if (p == m)
- break;
- }
+ /* undo all mnt_hold_writers() we'd done */
+ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt))
+ mnt_unhold_writers(p);
}
return err;
}
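
The simplified unwind depends on two invariants: on failure, `m` is advanced one past the mount whose mnt_hold_writers() failed, so the half-open range [mnt, m) is exactly the set that may have been held; and (next hunk) mnt_unhold_writers() is now safe to call on a mount that never got MNT_WRITE_HOLD. A standalone model of the shape (hypothetical names):

    int err = 0;

    for (m = first; m; m = next(m)) {
            err = arm(m);           /* may fail partway through */
            if (err) {
                    m = next(m);    /* step one past the failure */
                    break;
            }
    }
    if (err) {
            for (p = first; p != m; p = next(p))
                    undo(p);        /* must tolerate un-armed entries */
    }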
@@ -4830,8 +4762,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
WRITE_ONCE(m->mnt.mnt_flags, flags);
/* If we had to hold writers unblock them. */
- if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
- mnt_unhold_writers(m);
+ mnt_unhold_writers(m);
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
@@ -4841,7 +4772,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
touch_mnt_namespace(mnt->mnt_ns);
}
-static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
{
struct mount *mnt = real_mount(path->mnt);
int err = 0;
@@ -5639,6 +5570,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
STATMOUNT_MNT_UIDMAP | \
STATMOUNT_MNT_GIDMAP)
+/* locks: namespace_shared */
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5885,7 +5817,7 @@ retry:
if (ret)
return ret;
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
if (!ret)
@@ -5906,6 +5838,7 @@ struct klistmount {
struct path root;
};
+/* locks: namespace_shared */
static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
struct mnt_namespace *ns = kls->ns;
@@ -6040,7 +5973,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* We only need to guard against mount topology changes as
* listmount() doesn't care about any mount properties.
*/
- scoped_guard(rwsem_read, &namespace_sem)
+ scoped_guard(namespace_shared)
ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
@@ -6127,12 +6060,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!ns_ref_put(ns))
return;
- namespace_lock();
+ guard(namespace_excl)();
emptied_ns = ns;
- lock_mount_hash();
+ guard(mount_writer)();
umount_tree(ns->root, 0);
- unlock_mount_hash();
- namespace_unlock();
}
struct vfsmount *kern_mount(struct file_system_type *type)
@@ -6181,25 +6112,18 @@ bool our_mnt(struct vfsmount *mnt)
bool current_chrooted(void)
{
/* Does the current process have a non-standard root */
- struct path ns_root;
- struct path fs_root;
- bool chrooted;
-
- /* Find the namespace root */
- ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
- ns_root.dentry = ns_root.mnt->mnt_root;
- path_get(&ns_root);
- while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
- ;
+ struct path fs_root __free(path_put) = {};
+ struct mount *root;
get_fs_root(current->fs, &fs_root);
- chrooted = !path_equal(&fs_root, &ns_root);
+ /* Find the namespace root */
+
+ guard(mount_locked_reader)();
- path_put(&fs_root);
- path_put(&ns_root);
+ root = topmost_overmount(current->nsproxy->mnt_ns->root);
- return chrooted;
+ return fs_root.mnt != &root->mnt || !path_mounted(&fs_root);
}
static bool mnt_already_visible(struct mnt_namespace *ns,
@@ -6208,9 +6132,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
{
int new_flags = *new_mnt_flags;
struct mount *mnt, *n;
- bool visible = false;
- down_read(&namespace_sem);
+ guard(namespace_shared)();
rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;
@@ -6257,13 +6180,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_ATIME);
- visible = true;
- goto found;
+ return true;
next: ;
}
-found:
- up_read(&namespace_sem);
- return visible;
+ return false;
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5d6edafbed20..0e4c67373e4f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
struct pnfs_layout_segment *lseg;
struct xdr_buf buf;
struct xdr_stream xdr;
- struct page *scratch;
+ struct folio *scratch;
int status, i;
uint32_t count;
__be32 *p;
@@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
return ERR_PTR(-ENOMEM);
status = -ENOMEM;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf,
lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
status = -EIO;
p = xdr_inline_decode(&xdr, 4);
@@ -744,7 +744,7 @@ process_extents:
}
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
dprintk("%s returns %d\n", __func__, status);
switch (status) {
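
The same scratch-buffer conversion repeats in dev.c, dir.c and both file layout drivers below; the whole pattern is a one-for-one swap to an order-0 folio:

    struct folio *scratch = folio_alloc(gfp_mask, 0);   /* order 0 == 1 page */

    if (!scratch)
            return -ENOMEM;
    xdr_set_scratch_folio(&xdr, scratch);   /* was xdr_set_scratch_page() */
    /* ... decode using the xdr stream ... */
    folio_put(scratch);                     /* was __free_page() */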
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 44306ac22353..ab76120705e2 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct pnfs_block_dev *top;
struct xdr_stream xdr;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
int nr_volumes, ret, i;
__be32 *p;
- scratch = alloc_page(gfp_mask);
+ scratch = folio_alloc(gfp_mask, 0);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&xdr, scratch);
+ xdr_set_scratch_folio(&xdr, scratch);
p = xdr_inline_decode(&xdr, sizeof(__be32));
if (!p)
@@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
out_free_volumes:
kfree(volumes);
out_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out:
return node;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 86bdc7d23fb9..c8b837006bb2 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_xprt_destroy_all(serv, net);
+ svc_xprt_destroy_all(serv, net, false);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
ret = svc_bind(serv, net);
if (ret < 0) {
printk(KERN_WARNING "NFS: bind callback service failed\n");
- goto err_bind;
+ goto err;
}
ret = 0;
@@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
- goto err_socks;
+ goto err;
}
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
-err_bind:
+err:
nn->cb_users[minorversion]--;
dprintk("NFS: Couldn't create callback socket: err = %d; "
"net = %x\n", ret, net->ns.inum);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d81217923936..46d9c65d50f8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -829,17 +829,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
struct address_space *mapping = desc->file->f_mapping;
struct folio *new, *folio = *arrays;
struct xdr_stream stream;
- struct page *scratch;
+ struct folio *scratch;
struct xdr_buf buf;
u64 cookie;
int status;
- scratch = alloc_page(GFP_KERNEL);
+ scratch = folio_alloc(GFP_KERNEL, 0);
if (scratch == NULL)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
do {
status = nfs_readdir_entry_decode(desc, entry, &stream);
@@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
if (folio != *arrays)
nfs_readdir_folio_unlock_and_put(folio);
- put_page(scratch);
+ folio_put(scratch);
return status;
}
@@ -2198,8 +2198,6 @@ no_open:
else
dput(dentry);
}
- if (IS_ERR(res))
- return PTR_ERR(res);
return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
@@ -2260,7 +2258,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned int open_flags,
umode_t mode)
{
-
+ struct dentry *res = NULL;
/* Same as lookup+open from lookup_open(), but with different O_TRUNC
* handling.
*/
@@ -2275,21 +2273,15 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
if (error)
return error;
return finish_open(file, dentry, NULL);
- } else if (d_in_lookup(dentry)) {
+ }
+ if (d_in_lookup(dentry)) {
/* The only flags nfs_lookup considers are
* LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
* we want those to be zero so the lookup isn't skipped.
*/
- struct dentry *res = nfs_lookup(dir, dentry, 0);
-
- d_lookup_done(dentry);
- if (unlikely(res)) {
- if (IS_ERR(res))
- return PTR_ERR(res);
- return finish_no_open(file, res);
- }
+ res = nfs_lookup(dir, dentry, 0);
}
- return finish_no_open(file, NULL);
+ return finish_no_open(file, res);
}
EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
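
Both branches now funnel res (NULL, a dentry, or an ERR_PTR) straight into finish_no_open(); together with the IS_ERR check deleted from nfs_atomic_open() above, this assumes a matching VFS-side change, not shown in this diff, that lets finish_no_open() propagate ERR_PTR dentries itself:

    res = nfs_lookup(dir, dentry, 0);   /* NULL, dentry, or ERR_PTR */
    return finish_no_open(file, res);   /* assumed to handle all three */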
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8059ece82468..d020aab40c64 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
+ trace_nfs_file_read(iocb, to);
+
if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_read(iocb, to, false);
@@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
if (pnfs_ld_read_whole_page(file_inode(file)))
return true;
+ if (folio_test_dropbehind(folio))
+ return false;
/* Open for reading too? */
if (file->f_mode & FMODE_READ)
return true;
@@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb,
loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
- fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
struct file *file = iocb->ki_filp;
int once_thru = 0;
int ret;
+ trace_nfs_write_begin(file_inode(file), pos, len);
+
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
- fgp |= fgf_set_order(len);
start:
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out;
+ }
*foliop = folio;
ret = nfs_flush_incompatible(file, folio);
@@ -405,11 +410,14 @@ start:
} else if (!once_thru &&
nfs_want_read_modify_write(file, folio, pos, len)) {
once_thru = 1;
+ folio_clear_dropbehind(folio);
ret = nfs_read_folio(file, folio);
folio_put(folio);
if (!ret)
goto start;
}
+out:
+ trace_nfs_write_begin_done(file_inode(file), pos, len, ret);
return ret;
}
@@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb,
unsigned offset = offset_in_folio(folio, pos);
int status;
+ trace_nfs_write_end(file_inode(file), pos, len);
dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
@@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (status < 0)
+ if (status < 0) {
+ trace_nfs_write_end_done(file_inode(file), pos, len, status);
return status;
+ }
NFS_I(mapping->host)->write_io += copied;
if (nfs_ctx_key_to_expire(ctx, mapping->host))
nfs_wb_all(mapping->host);
+ trace_nfs_write_end_done(file_inode(file), pos, len, copied);
return copied;
}
@@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
errseq_t since;
int error;
+ trace_nfs_file_write(iocb, from);
+
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
@@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = {
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+ .fop_flags = FOP_DONTCACHE,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d39a1f58e18d..5c4551117c58 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
__be32 *p;
uint32_t nfl_util;
int i;
dprintk("%s: set_layout_map Begin\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
* num_fh (4) */
@@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
fl->fh_array[i]->size);
}
- __free_page(scratch);
+ folio_put(scratch);
return 0;
out_err:
- __free_page(scratch);
+ folio_put(scratch);
return -EIO;
}
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 29d9234d5c08..df79aeb68db4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_file_layout_dsaddr *dsaddr = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* Get the stripe count (number of stripe index) */
p = xdr_inline_decode(&stream, 4);
@@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
}
}
- __free_page(scratch);
+ folio_put(scratch);
return dsaddr;
out_err_drain_dsaddrs:
@@ -204,7 +204,7 @@ out_err_free_deviceid:
out_err_free_stripe_indices:
kfree(stripe_indices);
out_err_free_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 9edb5f9b0c4e..df01d2876b68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror);
+ struct nfs4_ff_layout_ds_stripe *dss_info);
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
@@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id)
}
static struct nfsd_file *
-ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id,
struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, fmode_t mode)
{
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
- return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode);
+ return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode);
#else
return NULL;
#endif
}
-static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
- const struct nfs4_ff_layout_mirror *m2)
+static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1,
+ const struct nfs4_ff_layout_ds_stripe *dss2)
{
int i, j;
- if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ if (dss1->fh_versions_cnt != dss2->fh_versions_cnt)
return false;
- for (i = 0; i < m1->fh_versions_cnt; i++) {
+
+ for (i = 0; i < dss1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; j++) {
- if (nfs_compare_fh(&m1->fh_versions[i],
- &m2->fh_versions[j]) == 0) {
+ for (j = 0; j < dss2->fh_versions_cnt; j++) {
+ if (nfs_compare_fh(&dss1->fh_versions[i],
+ &dss2->fh_versions[j]) == 0) {
found_fh = true;
break;
}
@@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return true;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id]))
+ return false;
+
+ return true;
+}
+
+static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ u32 dss_id;
+
+ if (m1->dss_count != m2->dss_count)
+ return false;
+
+ for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+ if (memcmp(&m1->dss[dss_id].devid,
+ &m2->dss[dss_id].devid,
+ sizeof(m1->dss[dss_id].devid)) != 0)
+ return false;
+
+ return true;
+}
+
static struct nfs4_ff_layout_mirror *
ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
struct nfs4_ff_layout_mirror *mirror)
@@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
spin_lock(&inode->i_lock);
list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
- if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
+ if (!ff_mirror_match_devid(mirror, pos))
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
@@ -240,29 +273,37 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
{
struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
mirror = kzalloc(sizeof(*mirror), gfp_flags);
if (mirror != NULL) {
spin_lock_init(&mirror->lock);
refcount_set(&mirror->ref, 1);
INIT_LIST_HEAD(&mirror->mirrors);
- nfs_localio_file_init(&mirror->nfl);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ nfs_localio_file_init(&mirror->dss[dss_id].nfl);
}
return mirror;
}
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- const struct cred *cred;
+ const struct cred *cred;
+ u32 dss_id;
ff_layout_remove_mirror(mirror);
- kfree(mirror->fh_versions);
- nfs_close_local_fh(&mirror->nfl);
- cred = rcu_access_pointer(mirror->ro_cred);
- put_cred(cred);
- cred = rcu_access_pointer(mirror->rw_cred);
- put_cred(cred);
- nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ kfree(mirror->dss[dss_id].fh_versions);
+ cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
+ put_cred(cred);
+ cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
+ put_cred(cred);
+ nfs_close_local_fh(&mirror->dss[dss_id].nfl);
+ nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+ }
+
+ kfree(mirror->dss);
kfree(mirror);
}
@@ -366,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
free_me);
}
+static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror)
+{
+ u32 dss_id, sum = 0;
+
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+ sum += mirror->dss[dss_id].efficiency;
+
+ return sum;
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
for (j = i + 1; j < fls->mirror_array_cnt; j++)
- if (fls->mirror_array[i]->efficiency <
- fls->mirror_array[j]->efficiency)
+ if (ff_mirror_efficiency_sum(fls->mirror_array[i]) <
+ ff_mirror_efficiency_sum(fls->mirror_array[j]))
swap(fls->mirror_array[i],
fls->mirror_array[j]);
}
@@ -388,20 +439,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_ff_layout_segment *fls = NULL;
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
u64 stripe_unit;
u32 mirror_array_cnt;
__be32 *p;
int i, rc;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
dprintk("--> %s\n", __func__);
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
return ERR_PTR(-ENOMEM);
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
lgr->layoutp->len);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* stripe unit and mirror_array_cnt */
rc = -EIO;
@@ -427,23 +479,32 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
+ u32 dss_count = 0;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
- u32 ds_count, fh_count, id;
- int j;
+ u32 fh_count, id;
+ int j, dss_id;
rc = -EIO;
p = xdr_inline_decode(&stream, 4);
if (!p)
goto out_err_free;
- ds_count = be32_to_cpup(p);
- /* FIXME: allow for striping? */
- if (ds_count != 1)
+ // Ensure all mirrors have the same stripe count.
+ if (dss_count == 0)
+ dss_count = be32_to_cpup(p);
+ else if (dss_count != be32_to_cpup(p))
+ goto out_err_free;
+
+ if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT ||
+ dss_count == 0)
+ goto out_err_free;
+
+ if (dss_count > 1 && stripe_unit == 0)
goto out_err_free;
fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
@@ -452,91 +513,105 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
}
- fls->mirror_array[i]->ds_count = ds_count;
+ fls->mirror_array[i]->dss_count = dss_count;
+ fls->mirror_array[i]->dss =
+ kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
+ gfp_flags);
- /* deviceid */
- rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
- if (rc)
- goto out_err_free;
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ dss_info->mirror = fls->mirror_array[i];
- /* efficiency */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+ /* deviceid */
+ rc = decode_deviceid(&stream, &dss_info->devid);
+ if (rc)
+ goto out_err_free;
- /* stateid */
- rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
- if (rc)
- goto out_err_free;
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ dss_info->efficiency = be32_to_cpup(p);
- /* fh */
- rc = -EIO;
- p = xdr_inline_decode(&stream, 4);
- if (!p)
- goto out_err_free;
- fh_count = be32_to_cpup(p);
+ /* stateid */
+ rc = decode_pnfs_stateid(&stream, &dss_info->stateid);
+ if (rc)
+ goto out_err_free;
- fls->mirror_array[i]->fh_versions =
- kcalloc(fh_count, sizeof(struct nfs_fh),
- gfp_flags);
- if (fls->mirror_array[i]->fh_versions == NULL) {
- rc = -ENOMEM;
- goto out_err_free;
- }
+ /* fh */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
- for (j = 0; j < fh_count; j++) {
- rc = decode_nfs_fh(&stream,
- &fls->mirror_array[i]->fh_versions[j]);
+ dss_info->fh_versions =
+ kcalloc(fh_count, sizeof(struct nfs_fh),
+ gfp_flags);
+ if (dss_info->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &dss_info->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ dss_info->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &id);
if (rc)
goto out_err_free;
- }
- fls->mirror_array[i]->fh_versions_cnt = fh_count;
+ uid = make_kuid(&init_user_ns, id);
- /* user */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ /* group */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
- uid = make_kuid(&init_user_ns, id);
+ gid = make_kgid(&init_user_ns, id);
- /* group */
- rc = decode_name(&stream, &id);
- if (rc)
- goto out_err_free;
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(&init_task);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
- gid = make_kgid(&init_user_ns, id);
+ kcred = prepare_kernel_cred(&init_task);
+ memalloc_nofs_restore(nofs_flags);
+ }
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = RCU_INITIALIZER(kcred);
- if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(&init_task);
- else {
- unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(&init_task);
- memalloc_nofs_restore(nofs_flags);
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ else
+ rcu_assign_pointer(dss_info->rw_cred, cred);
}
- rc = -ENOMEM;
- if (!kcred)
- goto out_err_free;
- kcred->fsuid = uid;
- kcred->fsgid = gid;
- cred = RCU_INITIALIZER(kcred);
-
- if (lgr->range.iomode == IOMODE_READ)
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- else
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
if (mirror != fls->mirror_array[i]) {
- /* swap cred ptrs so free_mirror will clean up old */
- if (lgr->range.iomode == IOMODE_READ) {
- cred = xchg(&mirror->ro_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
- } else {
- cred = xchg(&mirror->rw_cred, cred);
- rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ for (dss_id = 0; dss_id < dss_count; dss_id++) {
+ dss_info = &fls->mirror_array[i]->dss[dss_id];
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->dss[dss_id].ro_cred,
+ dss_info->ro_cred);
+ rcu_assign_pointer(dss_info->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->dss[dss_id].rw_cred,
+ dss_info->rw_cred);
+ rcu_assign_pointer(dss_info->rw_cred, cred);
+ }
}
ff_layout_free_mirror(fls->mirror_array[i]);
fls->mirror_array[i] = mirror;
@@ -564,7 +639,7 @@ out_sort_mirrors:
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
- __free_page(scratch);
+ folio_put(scratch);
return ret;
out_err_free:
_ff_layout_free_lseg(fls);
@@ -593,6 +668,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
_ff_layout_free_lseg(fls);
}
+static u32 calc_commit_idx(struct pnfs_layout_segment *lseg,
+ u32 mirror_idx, u32 dss_id)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id;
+}
+
+static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
+
+static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg,
+ u32 commit_index)
+{
+ return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count;
+}
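
The three helpers flatten (mirror, stripe) pairs into a single commit-bucket index and back. A standalone check with a hypothetical dss_count of 4 — the scheme is a bijection only because every mirror carries the same dss_count, which ff_layout_alloc_lseg() enforces during decode:

    #include <assert.h>

    static unsigned int commit_idx(unsigned int mirror, unsigned int dss,
                                   unsigned int dss_count)
    {
            return mirror * dss_count + dss;
    }

    int main(void)
    {
            unsigned int dss_count = 4, ci = commit_idx(2, 3, dss_count);

            assert(ci == 11);
            assert(ci / dss_count == 2);    /* mirror index back out */
            assert(ci % dss_count == 3);    /* stripe (dss) id back out */
            return 0;
    }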
+
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -617,6 +712,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
struct nfs4_ff_layoutstat *layoutstat,
ktime_t now)
{
@@ -624,8 +720,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
- if (!mirror->start_time)
- mirror->start_time = now;
+ if (!mirror->dss[dss_id].start_time)
+ mirror->dss[dss_id].start_time = now;
if (mirror->report_interval != 0)
report_interval = (s64)mirror->report_interval * 1000LL;
else if (layoutstats_timer != 0)
@@ -675,13 +771,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
static void
nfs4_ff_layout_stat_io_start_read(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror, dss_id, &mirror->dss[dss_id].read_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].read_stat, requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -692,11 +791,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed)
{
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat,
requested, completed,
ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
@@ -706,13 +806,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
static void
nfs4_ff_layout_stat_io_start_write(struct inode *inode,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
- nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ report = nfs4_ff_layoutstat_start_io(
+ mirror,
+ dss_id,
+ &mirror->dss[dss_id].write_stat,
+ now);
+ nfs4_ff_layout_stat_io_update_requested(
+ &mirror->dss[dss_id].write_stat,
+ requested);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
@@ -723,6 +830,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
__u64 requested,
__u64 completed,
enum nfs3_stable_how committed)
@@ -731,25 +839,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
requested = completed = 0;
spin_lock(&mirror->lock);
- nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat,
requested, completed, ktime_get(), task->tk_start);
set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_unavailable(devid);
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
if (devid)
nfs4_mark_deviceid_available(devid);
@@ -758,6 +866,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
@@ -768,12 +877,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ *dss_id = nfs4_ff_layout_calc_dss_id(
+ fls->stripe_unit,
+ fls->mirror_array[idx]->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false);
if (IS_ERR(ds))
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) {
// reinitialize the error state in case this is the last iteration
ds = ERR_PTR(-EINVAL);
continue;
@@ -788,42 +901,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
- return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- u32 start_idx, u32 *best_idx)
+ u32 start_idx, u32 *best_idx,
+ u32 offset, u32 *dss_id)
{
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
if (!IS_ERR(ds))
return ds;
- return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx,
+ offset, dss_id);
}
static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
- u32 *best_idx)
+ u32 *best_idx,
+ u32 offset,
+ u32 *dss_id)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
- best_idx);
+ best_idx, offset, dss_id);
if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
- return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx,
+ offset, dss_id);
}
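
nfs4_ff_layout_calc_dss_id() itself is outside this diff; judging by its arguments and the stripe arithmetic in ff_layout_pg_test() below, it presumably reduces to the classic striping formula — the offset's stripe number modulo the stripe count (a sketch, not the real helper):

    static u32 calc_dss_id(u64 stripe_unit, u32 dss_count, u64 offset)
    {
            if (dss_count < 2 || stripe_unit == 0)
                    return 0;               /* unstriped layout */
            return (u32)(div64_u64(offset, stripe_unit) % dss_count);
    }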
static void
@@ -842,6 +965,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
}
+static bool
+ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls)
+{
+ return fls->mirror_array[0]->dss_count > 1;
+}
+
+/*
+ * ff_layout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ unsigned int size;
+ u64 p_stripe, r_stripe;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
+ else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
+
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ if (p_stripe != r_stripe)
+ return 0;
+ }
+
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
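
Worked numbers for the code above (hypothetical: 1 MiB stripe_unit, segment offset 0, request at 1.5 MiB): the request sits 512 KiB into stripe 1, so at most 512 KiB can be coalesced before the I/O would cross onto the next stripe's data server. As a standalone model:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t stripe_room(uint64_t off, uint64_t seg_off, uint32_t unit)
    {
            return unit - (uint32_t)((off - seg_off) % unit);
    }

    int main(void)
    {
            /* 1 MiB stripes, segment at 0, request offset 1.5 MiB */
            printf("%u\n", stripe_room(3u << 19, 0, 1u << 20)); /* 524288 */
            return 0;
    }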
+
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
@@ -849,7 +1022,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- u32 ds_idx;
+ u32 ds_idx, dss_id;
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
@@ -870,7 +1043,8 @@ retry:
/* Reset wb_nio, since getting layout segment was successful */
req->wb_nio = 0;
- ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx,
+ req_offset(req), &dss_id);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -882,7 +1056,7 @@ retry:
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize;
pgio->pg_mirror_idx = ds_idx;
return;
@@ -919,7 +1093,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- u32 i;
+ u32 i, dss_id;
retry:
pnfs_generic_pg_check_layout(pgio, req);
@@ -944,7 +1118,12 @@ retry:
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit,
+ mirror->dss_count,
+ req_offset(req));
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror,
+ dss_id, true);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -954,7 +1133,7 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize;
}
if (NFS_SERVER(pgio->pg_inode)->flags &
@@ -1020,14 +1199,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
.pg_init = ff_layout_pg_init_read,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
.pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
.pg_init = ff_layout_pg_init_write,
- .pg_test = pnfs_generic_pg_test,
+ .pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
.pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
.pg_cleanup = pnfs_generic_pg_cleanup,
@@ -1075,9 +1254,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ u32 dss_id = 0;
struct nfs4_pnfs_ds *ds;
- ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx,
+ hdr->args.offset, &dss_id);
if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
else
@@ -1114,11 +1295,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
switch (op_status) {
@@ -1215,9 +1396,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
u32 op_status,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
- struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id);
switch (op_status) {
case NFS_OK:
@@ -1281,12 +1462,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- u32 idx)
+ u32 idx, u32 dss_id)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
if (task->tk_status >= 0) {
- ff_layout_mark_ds_reachable(lseg, idx);
+ ff_layout_mark_ds_reachable(lseg, idx, dss_id);
return 0;
}
@@ -1297,10 +1478,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
return ff_layout_async_handle_error_v3(task, op_status, clp,
- lseg, idx);
+ lseg, idx, dss_id);
case 4:
return ff_layout_async_handle_error_v4(task, op_status, state,
- clp, lseg, idx);
+ clp, lseg, idx, dss_id);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1309,7 +1490,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- u32 idx, u64 offset, u64 length,
+ u32 idx, u32 dss_id, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1347,7 +1528,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
+ mirror, dss_id, offset, length, status, opnum,
nfs_io_gfp_mask());
switch (status) {
@@ -1356,7 +1537,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
- ff_layout_mark_ds_unreachable(lseg, idx);
+ ff_layout_mark_ds_unreachable(lseg, idx, dss_id);
/*
* Don't return the layout if this is a read and we still
* have layouts to try
@@ -1376,10 +1557,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_READ,
task->tk_status);
@@ -1389,7 +1576,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_read(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1445,23 +1633,47 @@ ff_layout_set_layoutcommit(struct inode *inode,
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_read(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_read(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- hdr->res.count);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_read(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1549,11 +1761,17 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg);
+ u32 dss_id = nfs4_ff_layout_calc_dss_id(
+ flseg->stripe_unit,
+ flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count,
+ hdr->args.offset);
loff_t end_offs = 0;
int err;
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ ff_layout_io_track_ds_error(hdr->lseg,
+ hdr->pgio_mirror_idx, dss_id,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_WRITE,
task->tk_status);
@@ -1563,7 +1781,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
err = ff_layout_async_handle_error(task, hdr->res.op_status,
hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
- hdr->pgio_mirror_idx);
+ hdr->pgio_mirror_idx,
+ dss_id);
trace_nfs4_pnfs_write(hdr, err);
clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
@@ -1601,9 +1820,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
int err;
+ u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index);
+ u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index);
if (task->tk_status < 0) {
- ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ ff_layout_io_track_ds_error(data->lseg, idx, dss_id,
data->args.offset, data->args.count,
&data->res.op_status, OP_COMMIT,
task->tk_status);
@@ -1611,8 +1832,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
err = ff_layout_async_handle_error(task, data->res.op_status,
- NULL, data->ds_clp, data->lseg,
- data->ds_commit_index);
+ NULL, data->ds_clp, data->lseg, idx,
+ dss_id);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
@@ -1631,30 +1852,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
}
ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
-
return 0;
}
static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_start_write(hdr->inode,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count,
- task->tk_start);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_start_write(
+ hdr->inode,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ task->tk_start);
}
static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 dss_id;
+
if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
return;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
+
+ mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit,
+ mirror->dss_count,
+ hdr->args.offset);
+
+ nfs4_ff_layout_stat_io_end_write(
+ task,
+ mirror,
+ dss_id,
+ hdr->args.count,
+ hdr->res.count,
+ hdr->res.verf->committed);
set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}
@@ -1737,10 +1982,16 @@ static void ff_layout_write_release(void *data)
static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ u32 idx, dss_id;
+
if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_start_write(cdata->inode,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
0, task->tk_start);
}
@@ -1749,6 +2000,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
{
struct nfs_page *req;
__u64 count = 0;
+ u32 idx, dss_id;
if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
return;
@@ -1757,8 +2009,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
list_for_each_entry(req, &cdata->pages, wb_list)
count += req->wb_bytes;
}
+
+ idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index);
+ dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index);
nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ FF_LAYOUT_COMP(cdata->lseg, idx),
+ dss_id,
count, count, NFS_FILE_SYNC);
set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}
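The calc_commit_idx(), calc_mirror_idx_from_commit() and calc_dss_id_from_commit() helpers used above are defined outside the hunks shown here. Given that ff_layout_setup_ds_info() sizes the commit array as mirror_array_cnt * dss_count, a plausible sketch of the packing is the following (hypothetical reconstruction, not the verbatim kernel helpers):

#include <stdint.h>

/* One commit slot per (mirror, stripe) pair: pack as idx * dss_count + dss_id. */
static inline uint32_t pack_commit_idx(uint32_t mirror_idx, uint32_t dss_count,
				       uint32_t dss_id)
{
	return mirror_idx * dss_count + dss_id;
}

static inline uint32_t unpack_mirror_idx(uint32_t commit_idx, uint32_t dss_count)
{
	return commit_idx / dss_count;	/* cf. calc_mirror_idx_from_commit() */
}

static inline uint32_t unpack_dss_id(uint32_t commit_idx, uint32_t dss_count)
{
	return commit_idx % dss_count;	/* cf. calc_dss_id_from_commit() */
}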
@@ -1872,6 +2128,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
+ u32 dss_id;
bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
@@ -1879,22 +2136,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->args.pgbase, (size_t)hdr->args.count, offset);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1902,11 +2163,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1916,7 +2177,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Start IO accounting for local read */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ);
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
+ FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
@@ -1953,25 +2215,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
int vers;
struct nfs_fh *fh;
u32 idx = hdr->pgio_mirror_idx;
+ u32 dss_id;
bool ds_fatal_error = false;
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = nfs4_ff_layout_calc_dss_id(
+ FF_LAYOUT_LSEG(lseg)->stripe_unit,
+ mirror->dss_count,
+ offset);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- hdr->inode);
+ hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1981,12 +2248,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->pgio_done_cb = ff_layout_write_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(mirror);
+ hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id);
+ fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
- nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+ nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1995,7 +2262,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->args.offset = offset;
/* Start IO accounting for local write */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
hdr->task.tk_start = ktime_get();
@@ -2019,20 +2286,15 @@ out_failed:
return PNFS_NOT_ATTEMPTED;
}
-static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
-{
- return i;
-}
-
static struct nfs_fh *
-select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id)
{
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
/* FIXME: Assume that there is only one NFS version available
* for the DS.
*/
- return &flseg->mirror_array[i]->fh_versions[0];
+ return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0];
}
static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
@@ -2043,7 +2305,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
- u32 idx;
+ u32 idx, dss_id;
int vers, ret;
struct nfs_fh *fh;
@@ -2051,22 +2313,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
goto out_err;
- idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index);
mirror = FF_LAYOUT_COMP(lseg, idx);
- ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true);
if (IS_ERR(ds))
goto out_err;
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
- data->inode);
+ data->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(mirror);
+ vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2075,12 +2338,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
data->cred = ds_cred;
refcount_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
- fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ fh = select_ds_fh_from_commit(lseg, idx, dss_id);
if (fh)
data->args.fh = fh;
/* Start IO accounting for local commit */
- localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh,
+ localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ|FMODE_WRITE);
if (localio) {
data->task.tk_start = ktime_get();
@@ -2144,25 +2407,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
struct nfs4_pnfs_ds *ds;
struct nfs_client *ds_clp;
struct rpc_clnt *clnt;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
mirror = flseg->mirror_array[idx];
- mirror_ds = mirror->mirror_ds;
- if (IS_ERR_OR_NULL(mirror_ds))
- continue;
- ds = mirror->mirror_ds->ds;
- if (!ds)
- continue;
- ds_clp = ds->ds_clp;
- if (!ds_clp)
- continue;
- clnt = ds_clp->cl_rpcclient;
- if (!clnt)
- continue;
- if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
- continue;
- rpc_clnt_disconnect(clnt);
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ mirror_ds = mirror->dss[dss_id].mirror_ds;
+ if (IS_ERR_OR_NULL(mirror_ds))
+ continue;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN,
+ ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
}
}
@@ -2184,8 +2450,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
struct inode *inode = lseg->pls_layout->plh_inode;
struct pnfs_commit_array *array, *new;
+ u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count;
- new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ new = pnfs_alloc_commit_array(size,
nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
@@ -2549,11 +2816,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr,
static void
ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_ds_stripe *dss_info)
{
struct nfs4_pnfs_ds_addr *da;
- struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
- struct nfs_fh *fh = &mirror->fh_versions[0];
+ struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds;
+ struct nfs_fh *fh = &dss_info->fh_versions[0];
__be32 *p;
da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
@@ -2565,13 +2832,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
p = xdr_reserve_space(xdr, 4 + fh->size);
xdr_encode_opaque(p, fh->data, fh->size);
/* ff_io_latency4 read */
- spin_lock(&mirror->lock);
- ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ spin_lock(&dss_info->mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->read_stat.io_stat);
/* ff_io_latency4 write */
- ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
- spin_unlock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr,
+ &dss_info->write_stat.io_stat);
+ spin_unlock(&dss_info->mirror->lock);
/* nfstime4 */
- ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ ff_layout_encode_nfstime(xdr,
+ ktime_sub(ktime_get(),
+ dss_info->start_time));
/* bool */
p = xdr_reserve_space(xdr, 4);
*p = cpu_to_be32(false);
@@ -2595,7 +2866,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
static void
ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
{
- struct nfs4_ff_layout_mirror *mirror = opaque->data;
+ struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data;
+ struct nfs4_ff_layout_mirror *mirror = dss_info->mirror;
ff_layout_put_mirror(mirror);
}
@@ -2612,37 +2884,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds_stripe *dss_info;
struct nfs4_deviceid_node *dev;
- int i = 0;
+ int i = 0, dss_id;
list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
- if (i >= dev_limit)
- break;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
- continue;
- if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
- &mirror->flags) &&
- type != NFS4_FF_OP_LAYOUTRETURN)
- continue;
- /* mirror refcount put in cleanup_layoutstats */
- if (!refcount_inc_not_zero(&mirror->ref))
- continue;
- dev = &mirror->mirror_ds->id_node;
- memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = 0;
- devinfo->length = NFS4_MAX_UINT64;
- spin_lock(&mirror->lock);
- devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
- devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
- devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
- spin_unlock(&mirror->lock);
- devinfo->layout_type = LAYOUT_FLEX_FILES;
- devinfo->ld_private.ops = &layoutstat_ops;
- devinfo->ld_private.data = mirror;
-
- devinfo++;
- i++;
+ for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) {
+ dss_info = &mirror->dss[dss_id];
+ if (i >= dev_limit)
+ break;
+ if (IS_ERR_OR_NULL(dss_info->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!refcount_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &dss_info->mirror_ds->id_node;
+ memcpy(&devinfo->dev_id,
+ &dev->deviceid,
+ NFS4_DEVICEID4_SIZE);
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
+ devinfo->read_count =
+ dss_info->read_stat.io_stat.ops_completed;
+ devinfo->read_bytes =
+ dss_info->read_stat.io_stat.bytes_completed;
+ devinfo->write_count =
+ dss_info->write_stat.io_stat.ops_completed;
+ devinfo->write_bytes =
+ dss_info->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = &mirror->dss[dss_id];
+
+ devinfo++;
+ i++;
+ }
}
return i;
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 095df09017a5..17a008c8e97c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -21,6 +21,8 @@
* due to network error etc. */
#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096
+
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
#define FF_LAYOUTSTATS_MAXDEV 4
@@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat {
struct nfs4_ff_busy_timer busy_timer;
};
-struct nfs4_ff_layout_mirror {
- struct pnfs_layout_hdr *layout;
- struct list_head mirrors;
- u32 ds_count;
- u32 efficiency;
+struct nfs4_ff_layout_mirror;
+
+struct nfs4_ff_layout_ds_stripe {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
+ u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
@@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror {
const struct cred __rcu *ro_cred;
const struct cred __rcu *rw_cred;
struct nfs_file_localio nfl;
- refcount_t ref;
- spinlock_t lock;
- unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
+};
+
+struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
+ u32 dss_count;
+ struct nfs4_ff_layout_ds_stripe *dss;
+ refcount_t ref;
+ spinlock_t lock;
+ unsigned long flags;
u32 report_interval;
};
@@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
}
static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
if (mirror != NULL) {
- struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds;
if (!IS_ERR_OR_NULL(mirror_ds))
return &mirror_ds->id_node;
@@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
}
static inline int
-nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
+{
+ return mirror->dss[dss_id].mirror_ds->ds_versions[0].version;
+}
+
+static inline u32
+nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset)
{
- return mirror->mirror_ds->ds_versions[0].version;
+ u64 tmp = offset;
+
+ if (dss_count == 1 || stripe_unit == 0)
+ return 0;
+
+ do_div(tmp, stripe_unit);
+
+ return do_div(tmp, dss_count);
}
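In plain arithmetic the helper computes dss_id = (offset / stripe_unit) % dss_count; do_div() is used because 64-bit division on 32-bit architectures needs a library helper, and it stores the quotient in its first argument while returning the remainder. A minimal userspace check of the mapping, assuming 1 MiB stripes across three stripes:

#include <assert.h>
#include <stdint.h>

static uint32_t calc_dss_id(uint64_t stripe_unit, uint32_t dss_count,
			    uint64_t offset)
{
	if (dss_count == 1 || stripe_unit == 0)
		return 0;
	return (uint32_t)((offset / stripe_unit) % dss_count);
}

int main(void)
{
	assert(calc_dss_id(1 << 20, 3, 0) == 0);		/* first stripe */
	assert(calc_dss_id(1 << 20, 3, (1 << 20) + 5) == 1);	/* second stripe */
	assert(calc_dss_id(1 << 20, 3, 3ULL << 20) == 0);	/* wraps around */
	return 0;
}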
struct nfs4_ff_layout_ds *
@@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags);
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags);
void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
@@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id);
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid);
+ u32 dss_id,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return);
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
- struct inode *inode);
+ struct inode *inode,
+ u32 dss_id);
const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred);
+ const struct cred *mdscred,
+ u32 dss_id);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 30365ec782bb..c55ea8fa3bfa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
{
struct xdr_stream stream;
struct xdr_buf buf;
- struct page *scratch;
+ struct folio *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
@@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
int i, ret = -ENOMEM;
/* set up xdr stream */
- scratch = alloc_page(gfp_flags);
+ scratch = folio_alloc(gfp_flags, 0);
if (!scratch)
goto out_err;
@@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
INIT_LIST_HEAD(&dsaddrs);
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_page(&stream, scratch);
+ xdr_set_scratch_folio(&stream, scratch);
/* multipath count */
p = xdr_inline_decode(&stream, 4);
@@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
kfree(da);
}
- __free_page(scratch);
+ folio_put(scratch);
return new_ds;
out_err_drain_dsaddrs:
@@ -177,7 +177,7 @@ out_err_drain_dsaddrs:
kfree(ds_versions);
out_scratch:
- __free_page(scratch);
+ folio_put(scratch);
out_err:
kfree(new_ds);
@@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
}
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
- struct nfs4_ff_layout_mirror *mirror, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- gfp_t gfp_flags)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id, u64 offset, u64 length, int status,
+ enum nfs_opnum4 opnum, gfp_t gfp_flags)
{
struct nfs4_ff_layout_ds_err *dserr;
if (status == 0)
return 0;
- if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds))
return -EINVAL;
dserr = kmalloc(sizeof(*dserr), gfp_flags);
@@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
dserr->length = length;
dserr->status = status;
dserr->opnum = opnum;
- nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
- memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid);
+ memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid,
NFS4_DEVICEID4_SIZE);
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
@@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
}
static const struct cred *
-ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id)
{
const struct cred *cred, __rcu **pcred;
if (iomode == IOMODE_READ)
- pcred = &mirror->ro_cred;
+ pcred = &mirror->dss[dss_id].ro_cred;
else
- pcred = &mirror->rw_cred;
+ pcred = &mirror->dss[dss_id].rw_cred;
rcu_read_lock();
do {
@@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id)
{
/* FIXME: For now assume there is only 1 version available for the DS */
- return &mirror->fh_versions[0];
+ return &mirror->dss[dss_id].fh_versions[0];
}
void
nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
- nfs4_stateid *stateid)
+ u32 dss_id,
+ nfs4_stateid *stateid)
{
- if (nfs4_ff_layout_ds_version(mirror) == 4)
- nfs4_stateid_copy(stateid, &mirror->stateid);
+ if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4)
+ nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid);
}
static bool
ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id)
{
if (mirror == NULL)
goto outerr;
- if (mirror->mirror_ds == NULL) {
+ if (mirror->dss[dss_id].mirror_ds == NULL) {
struct nfs4_deviceid_node *node;
struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
- &mirror->devid, lo->plh_lc_cred,
+ &mirror->dss[dss_id].devid, lo->plh_lc_cred,
GFP_KERNEL);
if (node)
mirror_ds = FF_LAYOUT_MIRROR_DS(node);
/* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) &&
mirror_ds != ERR_PTR(-ENODEV))
nfs4_put_deviceid_node(node);
}
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
goto outerr;
return true;
@@ -352,6 +354,7 @@ outerr:
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
* @mirror: layout mirror describing the DS to use
+ * @dss_id: id of the DS stripe to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -368,6 +371,7 @@ outerr:
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
+ u32 dss_id,
bool fail_return)
{
struct nfs4_pnfs_ds *ds;
@@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
unsigned int max_payload;
int status = -EAGAIN;
- if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id))
goto noconnect;
- ds = mirror->mirror_ds->ds;
+ ds = mirror->dss[dss_id].mirror_ds->ds;
if (READ_ONCE(ds->ds_clp))
goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node,
dataserver_timeo, dataserver_retrans,
- mirror->mirror_ds->ds_versions[0].version,
- mirror->mirror_ds->ds_versions[0].minor_version);
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].version,
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
if (!status) {
@@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
- if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
- mirror->mirror_ds->ds_versions[0].rsize = max_payload;
- if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
- mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, lseg->pls_range.offset,
+ mirror, dss_id, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
ff_layout_send_layouterror(lseg);
@@ -426,12 +430,13 @@ out:
const struct cred *
ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
const struct pnfs_layout_range *range,
- const struct cred *mdscred)
+ const struct cred *mdscred,
+ u32 dss_id)
{
const struct cred *cred;
- if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, range->iomode);
+ if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) {
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
* @mirror: pointer to the mirror
* @ds_clp: nfs_client for the DS
* @inode: pointer to inode
+ * @dss_id: DS stripe id
*
* Find or create a DS rpc client with the MDS server rpc client auth flavor
* in the nfs_client cl_ds_clients list.
*/
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
- struct nfs_client *ds_clp, struct inode *inode)
+ struct nfs_client *ds_clp, struct inode *inode,
+ u32 dss_id)
{
- switch (mirror->mirror_ds->ds_versions[0].version) {
+ switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
return ds_clp->cl_rpcclient;
@@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (mirror) {
- if (!mirror->mirror_ds)
+ if (!mirror)
+ continue;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (!mirror->dss[dss_id].mirror_ds)
return true;
- if (IS_ERR(mirror->mirror_ds))
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
continue;
- devid = &mirror->mirror_ds->id_node;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
@@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- u32 idx;
+ u32 idx, dss_id;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (!mirror || IS_ERR(mirror->mirror_ds))
- return false;
- if (!mirror->mirror_ds)
- continue;
- devid = &mirror->mirror_ds->id_node;
- if (nfs4_test_deviceid_unavailable(devid))
+ if (!mirror)
return false;
+ for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+ if (IS_ERR(mirror->dss[dss_id].mirror_ds))
+ return false;
+ if (!mirror->dss[dss_id].mirror_ds)
+ continue;
+ devid = &mirror->dss[dss_id].mirror_ds->id_node;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return false;
+ }
}
return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 9e94d18448ff..b4679b7161b0 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
int ret;
data->context[NFS_MAX_CONTEXT_LEN] = '\0';
- ret = vfs_parse_fs_string(fc, "context",
- data->context, strlen(data->context));
+ ret = vfs_parse_fs_string(fc, "context", data->context);
if (ret < 0)
return ret;
#else
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9bdaf7f38bed..18b57c7c2f97 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1073,6 +1073,21 @@ out_no_revalidate:
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
stat->btime = NFS_I(inode)->btime;
+
+ /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN:
+ * - NFS doesn't have DIO alignment constraints, so avoid fetching
+ * these DIO attrs from the remote and just respond with the most
+ * accommodating limits (so the client will issue supported DIO).
+ * - this is unintuitive, but the most coarse-grained
+ * dio_offset_align is the most accommodating.
+ */
+ if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) &&
+ S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = 4; /* 4-byte alignment */
+ stat->dio_offset_align = PAGE_SIZE;
+ stat->dio_read_offset_align = stat->dio_offset_align;
+ }
out:
trace_nfs_getattr_exit(inode, err);
return err;
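These hints surface to applications through statx(2); a small probe of the values this change now fills in for regular files might look like the sketch below (assumes headers new enough to define STATX_DIO_READ_ALIGN and stx_dio_read_offset_align; "/mnt/nfs/file" is a placeholder path):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "/mnt/nfs/file", 0,
		  STATX_DIOALIGN | STATX_DIO_READ_ALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	printf("mem_align=%u offset_align=%u read_offset_align=%u\n",
	       stx.stx_dio_mem_align, stx.stx_dio_offset_align,
	       stx.stx_dio_read_offset_align);
	return 0;
}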
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c0a44f389f8f..2ecd38e1d17a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
+struct nfs_local_dio {
+ u32 mem_align;
+ u32 offset_align;
+ loff_t middle_offset;
+ loff_t end_offset;
+ ssize_t start_len; /* Length for misaligned first extent */
+ ssize_t middle_len; /* Length for DIO-aligned middle extent */
+ ssize_t end_len; /* Length for misaligned last extent */
+};
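The three extents partition [offset, offset + len) at offset_align boundaries: a possibly misaligned head, a DIO-capable middle, and a possibly misaligned tail. A standalone sketch of the same round_up()/round_down() arithmetic used by nfs_is_local_dio_possible() below (the mask trick assumes a power-of-two alignment):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 1000, len = 10000, align = 4096;
	uint64_t mid_start = (offset + align - 1) & ~(align - 1); /* round_up */
	uint64_t mid_end = (offset + len) & ~(align - 1);         /* round_down */

	/* prints: start 3096 bytes @1000, middle 4096 bytes @4096,
	 * end 2808 bytes @8192 (buffered, DIO, buffered respectively)
	 */
	printf("start:  %llu bytes @%llu\n",
	       (unsigned long long)(mid_start - offset),
	       (unsigned long long)offset);
	printf("middle: %llu bytes @%llu\n",
	       (unsigned long long)(mid_end - mid_start),
	       (unsigned long long)mid_start);
	printf("end:    %llu bytes @%llu\n",
	       (unsigned long long)(offset + len - mid_end),
	       (unsigned long long)mid_end);
	return 0;
}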
+
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 97abf62f109d..2c0455e91571 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -30,6 +30,8 @@
#define NFSDBG_FACILITY NFSDBG_VFS
+#define NFSLOCAL_MAX_IOS 3
+
struct nfs_local_kiocb {
struct kiocb kiocb;
struct bio_vec *bvec;
@@ -37,6 +39,14 @@ struct nfs_local_kiocb {
struct work_struct work;
void (*aio_complete_work)(struct work_struct *);
struct nfsd_file *localio;
+ /* Begin mostly DIO-specific members */
+ size_t end_len;
+ short int end_iter_index;
+ short int n_iters;
+ bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
+ loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
+ struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ /* End mostly DIO-specific members */
};
struct nfs_local_fsync_ctx {
@@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx {
static bool localio_enabled __read_mostly = true;
module_param(localio_enabled, bool, 0644);
-static bool localio_O_DIRECT_semantics __read_mostly = false;
-module_param(localio_O_DIRECT_semantics, bool, 0644);
-MODULE_PARM_DESC(localio_O_DIRECT_semantics,
- "LOCALIO will use O_DIRECT semantics to filesystem.");
-
static inline bool nfs_client_is_local(const struct nfs_client *clp)
{
return !!rcu_access_pointer(clp->cl_uuid.net);
@@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
+ int status = 0;
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
- int status = PTR_ERR(localio);
- trace_nfs_local_open_fh(fh, mode, status);
+ status = PTR_ERR(localio);
switch (status) {
case -ENOMEM:
case -ENXIO:
@@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
nfs_local_probe(clp);
}
}
+ trace_nfs_local_open_fh(fh, mode, status);
return localio;
}
@@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
-static struct bio_vec *
-nfs_bvec_alloc_and_import_pagevec(struct page **pagevec,
- unsigned int npages, gfp_t flags)
-{
- struct bio_vec *bvec, *p;
-
- bvec = kmalloc_array(npages, sizeof(*bvec), flags);
- if (bvec != NULL) {
- for (p = bvec; npages > 0; p++, pagevec++, npages--) {
- p->bv_page = *pagevec;
- p->bv_len = PAGE_SIZE;
- p->bv_offset = 0;
- }
- }
- return bvec;
-}
-
static void
nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
{
@@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
{
struct nfs_local_kiocb *iocb;
- iocb = kmalloc(sizeof(*iocb), flags);
+ iocb = kzalloc(sizeof(*iocb), flags);
if (iocb == NULL)
return NULL;
- iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec,
- hdr->page_array.npages, flags);
+
+ iocb->bvec = kmalloc_array(hdr->page_array.npages,
+ sizeof(struct bio_vec), flags);
if (iocb->bvec == NULL) {
kfree(iocb);
return NULL;
}
- if (localio_O_DIRECT_semantics &&
- test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
- iocb->kiocb.ki_filp = file;
- iocb->kiocb.ki_flags = IOCB_DIRECT;
- } else
- init_sync_kiocb(&iocb->kiocb, file);
+ init_sync_kiocb(&iocb->kiocb, file);
- iocb->kiocb.ki_pos = hdr->args.offset;
iocb->hdr = hdr;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
iocb->aio_complete_work = NULL;
+ iocb->end_iter_index = -1;
+
return iocb;
}
-static void
-nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+static bool
+nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw,
+ size_t len, struct nfs_local_dio *local_dio)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ loff_t offset = hdr->args.offset;
+ u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align;
+ loff_t start_end, orig_end, middle_end;
+
+ nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align,
+ &nf_dio_offset_align, &nf_dio_read_offset_align);
+ if (rw == ITER_DEST)
+ nf_dio_offset_align = nf_dio_read_offset_align;
+
+ if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align))
+ return false;
+ if (unlikely(nf_dio_offset_align > PAGE_SIZE))
+ return false;
+ if (unlikely(len < nf_dio_offset_align))
+ return false;
+
+ local_dio->mem_align = nf_dio_mem_align;
+ local_dio->offset_align = nf_dio_offset_align;
+
+ start_end = round_up(offset, nf_dio_offset_align);
+ orig_end = offset + len;
+ middle_end = round_down(orig_end, nf_dio_offset_align);
+
+ local_dio->middle_offset = start_end;
+ local_dio->end_offset = middle_end;
+
+ local_dio->start_len = start_end - offset;
+ local_dio->middle_len = middle_end - start_end;
+ local_dio->end_len = orig_end - middle_end;
+
+ if (rw == ITER_DEST)
+ trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio);
+ else
+ trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio);
+ return true;
+}
+
+static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
+ unsigned int addr_mask, unsigned int len_mask)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t skip = i->iov_offset;
+ size_t size = i->count;
+
+ if (size & len_mask)
+ return false;
+ do {
+ size_t len = bvec->bv_len;
+
+ if (len > size)
+ len = size;
+ if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
+ return false;
+ bvec++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
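This appears to mirror the bvec case of iov_iter_is_aligned(): each segment's offset is tested against an address mask and the total count against a length mask, which is why the caller passes mem_align - 1 and offset_align - 1; the masks are only valid for power-of-two alignments. The core test reduces to:

#include <assert.h>
#include <stdint.h>

/* Valid only when align is a power of two (mask = align - 1). */
static int aligned(uint64_t v, uint64_t align)
{
	return (v & (align - 1)) == 0;
}

int main(void)
{
	assert(aligned(8192, 4096));
	assert(!aligned(8200, 4096));
	return 0;
}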
+
+/*
+ * Set up as many as 3 iov_iters based on the extents described by @local_dio.
+ * Returns the number of iov_iters that were set up.
+ */
+static int
+nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
+ unsigned int nvecs, size_t len,
+ struct nfs_local_dio *local_dio)
+{
+ int n_iters = 0;
+ struct iov_iter *iters = iocb->iters;
+
+ /* Set up misaligned start? */
+ if (local_dio->start_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iters[n_iters].count = local_dio->start_len;
+ iocb->offset[n_iters] = iocb->hdr->args.offset;
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ ++n_iters;
+ }
+
+ /* Set up misaligned end?
+ * If so, the end is purposely set up to be issued using buffered IO
+ * before the middle (which will use DIO, if DIO-aligned, with AIO).
+ * This creates problems if/when the end results in a partial write,
+ * so the end's index and length must be saved for that corner case.
+ */
+ if (local_dio->end_len) {
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ iocb->offset[n_iters] = local_dio->end_offset;
+ iov_iter_advance(&iters[n_iters],
+ local_dio->start_len + local_dio->middle_len);
+ iocb->iter_is_dio_aligned[n_iters] = false;
+ /* Save index and length of end */
+ iocb->end_iter_index = n_iters;
+ iocb->end_len = local_dio->end_len;
+ ++n_iters;
+ }
+
+ /* Set up the DIO-aligned middle to be issued last, to allow for
+ * DIO with AIO completion (see nfs_local_call_{read,write}).
+ */
+ iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
+ if (local_dio->start_len)
+ iov_iter_advance(&iters[n_iters], local_dio->start_len);
+ iters[n_iters].count -= local_dio->end_len;
+ iocb->offset[n_iters] = local_dio->middle_offset;
+
+ iocb->iter_is_dio_aligned[n_iters] =
+ nfs_iov_iter_aligned_bvec(&iters[n_iters],
+ local_dio->mem_align - 1, local_dio->offset_align - 1);
+
+ if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
+ trace_nfs_local_dio_misaligned(iocb->hdr->inode,
+ iocb->hdr->args.offset, len, local_dio);
+ return 0; /* no DIO-aligned IO possible */
+ }
+ ++n_iters;
+
+ iocb->n_iters = n_iters;
+ return n_iters;
+}
+
+static noinline_for_stack void
+nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
{
struct nfs_pgio_header *hdr = iocb->hdr;
+ struct page **pagevec = hdr->page_array.pagevec;
+ unsigned long v, total;
+ unsigned int base;
+ size_t len;
+
+ v = 0;
+ total = hdr->args.count;
+ base = hdr->args.pgbase;
+ while (total && v < hdr->page_array.npages) {
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&iocb->bvec[v], *pagevec, len, base);
+ total -= len;
+ ++pagevec;
+ ++v;
+ base = 0;
+ }
+ len = hdr->args.count - total;
+
+ if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
+ struct nfs_local_dio local_dio;
+
+ if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+ return; /* is DIO-aligned */
+ }
- iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages,
- hdr->args.count + hdr->args.pgbase);
- if (hdr->args.pgbase != 0)
- iov_iter_advance(i, hdr->args.pgbase);
+ /* Use buffered IO */
+ iocb->offset[0] = hdr->args.offset;
+ iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
+ iocb->n_iters = 1;
}
static void
@@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
static void
nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
{
+ /* Must handle partial completions */
if (status >= 0) {
- hdr->res.count = status;
- hdr->res.op_status = NFS4_OK;
- hdr->task.tk_status = 0;
+ hdr->res.count += status;
+ /* @hdr was initialized to 0 (zeroed during allocation) */
+ if (hdr->task.tk_status == 0)
+ hdr->res.op_status = NFS4_OK;
} else {
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
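Because one RPC-sized request can now be issued as up to three sub-I/Os, the done path accumulates res.count rather than overwriting it. A toy illustration, reusing the extent sizes from the earlier splitting example:

#include <assert.h>

int main(void)
{
	long done[3] = { 3096, 2808, 4096 };	/* start, end, middle */
	long res_count = 0;

	for (int i = 0; i < 3; i++)
		res_count += done[i];	/* as in hdr->res.count += status */
	assert(res_count == 10000);
	return 0;
}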
@@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
}
static void
+nfs_local_iocb_release(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_file_put(iocb->localio);
+ nfs_local_iocb_free(iocb);
+}
+
+static void
nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- nfs_local_file_put(iocb->localio);
- nfs_local_iocb_free(iocb);
+ nfs_local_iocb_release(iocb);
nfs_local_hdr_release(hdr, hdr->task.tk_ops);
}
@@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
- nfs_local_pgio_done(hdr, status);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n");
+ }
/*
* Must clear replen otherwise NFSv3 data corruption will occur
@@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_read_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
@@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work)
container_of(work, struct nfs_local_kiocb, work);
struct file *filp = iocb->kiocb.ki_filp;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, READ);
+ for (int i = 0; i < iocb->n_iters; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
- status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
revert_creds(save_cred);
@@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work)
}
static int
-nfs_do_local_read(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_read(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without read_iter */
- if (!file->f_op->read_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_read count=%u pos=%llu\n",
__func__, hdr->args.count, hdr->args.offset);
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
nfs_local_pgio_init(hdr, call_ops);
hdr->res.eof = false;
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_read);
queue_work(nfslocaliod_workqueue, &iocb->work);
@@ -529,7 +676,7 @@ nfs_set_local_verifier(struct inode *inode,
}
/* Factored out from fs/nfsd/vfs.h:fh_getattr() */
-static int __vfs_getattr(struct path *p, struct kstat *stat, int version)
+static int __vfs_getattr(const struct path *p, struct kstat *stat, int version)
{
u32 request_mask = STATX_BASIC_STATS;
@@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
+ if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
+ /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
+ pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n");
+ }
+
/* Handle short writes as if they are ENOSPC */
+ status = hdr->res.count;
if (status > 0 && status < hdr->args.count) {
hdr->mds_offset += status;
hdr->args.offset += status;
@@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
hdr->args.count -= status;
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
+ /* record -ENOSPC via nfs_local_pgio_done */
+ nfs_local_pgio_done(hdr, status);
}
- if (status < 0)
+ if (hdr->task.tk_status < 0)
nfs_reset_boot_verifier(inode);
-
- nfs_local_pgio_done(hdr, status);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
+ nfs_local_pgio_done(iocb->hdr, ret);
nfs_local_write_done(iocb, ret);
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
@@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work)
struct file *filp = iocb->kiocb.ki_filp;
unsigned long old_flags = current->flags;
const struct cred *save_cred;
- struct iov_iter iter;
ssize_t status;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
save_cred = override_creds(filp->f_cred);
- nfs_local_iter_init(&iter, iocb, WRITE);
-
file_start_write(filp);
- status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ for (int i = 0; i < iocb->n_iters; i++) {
+ if (iocb->iter_is_dio_aligned[i]) {
+ iocb->kiocb.ki_flags |= IOCB_DIRECT;
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+retry:
+ iocb->kiocb.ki_pos = iocb->offset[i];
+ status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
+ if (status != -EIOCBQUEUED) {
+ if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
+ /* partial write */
+ if (i == iocb->end_iter_index) {
+ /* Must not account the partial end: because the
+ * end is issued before the middle, the partial
+ * write accounting in nfs_local_write_done()
+ * would incorrectly advance hdr->args.offset.
+ */
+ status = 0;
+ } else {
+ /* Partial write at start or buffered middle,
+ * exit early.
+ */
+ nfs_local_pgio_done(iocb->hdr, status);
+ break;
+ }
+ } else if (unlikely(status == -ENOTBLK &&
+ (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
+ /* VFS will return -ENOTBLK if DIO WRITE fails to
+ * invalidate the page cache. Retry using buffered IO.
+ */
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+ iocb->kiocb.ki_complete = NULL;
+ iocb->aio_complete_work = NULL;
+ goto retry;
+ }
+ nfs_local_pgio_done(iocb->hdr, status);
+ if (iocb->hdr->task.tk_status)
+ break;
+ }
+ }
file_end_write(filp);
revert_creds(save_cred);
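The retry above falls back from direct to buffered I/O when the VFS reports -ENOTBLK (the DIO write could not invalidate the page cache). Reduced to its essentials, the pattern is the hypothetical helper below (a condensation, not the kernel code; per-iteration accounting elided):

#include <linux/fs.h>

static ssize_t write_iter_with_fallback(struct file *filp, struct kiocb *kiocb,
					struct iov_iter *iter, loff_t pos)
{
	ssize_t ret;

	kiocb->ki_pos = pos;
	kiocb->ki_flags |= IOCB_DIRECT;
	ret = filp->f_op->write_iter(kiocb, iter);
	if (ret == -ENOTBLK) {
		/* go buffered: clear DIO and reissue at the same position */
		kiocb->ki_flags &= ~IOCB_DIRECT;
		kiocb->ki_pos = pos;
		ret = filp->f_op->write_iter(kiocb, iter);
	}
	return ret;
}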
@@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work)
}
static int
-nfs_do_local_write(struct nfs_pgio_header *hdr,
- struct nfsd_file *localio,
+nfs_local_do_write(struct nfs_local_kiocb *iocb,
const struct rpc_call_ops *call_ops)
{
- struct nfs_local_kiocb *iocb;
- struct file *file = nfs_to->nfsd_file_file(localio);
-
- /* Don't support filesystems without write_iter */
- if (!file->f_op->write_iter)
- return -EAGAIN;
+ struct nfs_pgio_header *hdr = iocb->hdr;
dprintk("%s: vfs_write count=%u pos=%llu %s\n",
__func__, hdr->args.count, hdr->args.offset,
(hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
- iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO);
- if (iocb == NULL)
- return -ENOMEM;
- iocb->localio = localio;
-
switch (hdr->args.stable) {
default:
break;
@@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr,
nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
- if (iocb->kiocb.ki_flags & IOCB_DIRECT) {
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-
INIT_WORK(&iocb->work, nfs_local_call_write);
queue_work(nfslocaliod_workqueue, &iocb->work);
return 0;
}
+static struct nfs_local_kiocb *
+nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio)
+{
+ struct file *file = nfs_to->nfsd_file_file(localio);
+ struct nfs_local_kiocb *iocb;
+ gfp_t gfp_mask;
+ int rw;
+
+ if (hdr->rw_mode & FMODE_READ) {
+ if (!file->f_op->read_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_KERNEL;
+ rw = ITER_DEST;
+ } else {
+ if (!file->f_op->write_iter)
+ return ERR_PTR(-EOPNOTSUPP);
+ gfp_mask = GFP_NOIO;
+ rw = ITER_SOURCE;
+ }
+
+ iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask);
+ if (iocb == NULL)
+ return ERR_PTR(-ENOMEM);
+ iocb->hdr = hdr;
+ iocb->localio = localio;
+
+ nfs_local_iters_init(iocb, rw);
+
+ return iocb;
+}
+
int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops)
{
+ struct nfs_local_kiocb *iocb;
int status = 0;
if (!hdr->args.count)
return 0;
+ iocb = nfs_local_iocb_init(hdr, localio);
+ if (IS_ERR(iocb))
+ return PTR_ERR(iocb);
+
switch (hdr->rw_mode) {
case FMODE_READ:
- status = nfs_do_local_read(hdr, localio, call_ops);
+ status = nfs_local_do_read(iocb, call_ops);
break;
case FMODE_WRITE:
- status = nfs_do_local_write(hdr, localio, call_ops);
+ status = nfs_local_do_write(iocb, call_ops);
break;
default:
dprintk("%s: invalid mode: %d\n", __func__,
hdr->rw_mode);
- status = -EINVAL;
+ status = -EOPNOTSUPP;
}
if (status != 0) {
if (status == -EAGAIN)
nfs_localio_disable_client(clp);
- nfs_local_file_put(localio);
+ nfs_local_iocb_release(iocb);
hdr->task.tk_status = status;
nfs_local_hdr_release(hdr, call_ops);
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f9a3a1fbf44c..5a4d193da1a9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc)
nfs_errorf(fc, "NFS: Couldn't determine submount pathname");
ret = PTR_ERR(p);
} else {
- ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p);
+ ret = vfs_parse_fs_qstr(fc, "source",
+ &QSTR_LEN(p, buffer + 4096 - p));
if (!ret)
ret = vfs_get_tree(fc);
}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 6e75c6c2d234..9eff09158518 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 4ae01c10b7e2..e17d72908412 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -23,8 +23,8 @@
#include <linux/nfsacl.h>
#include <linux/nfs_common.h>
-#include "nfstrace.h"
#include "internal.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_XDR
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6a0b5871ba3b..d537fb0c230e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
ret = -ENOMEM;
- res.scratch = alloc_page(GFP_KERNEL);
+ res.scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.scratch)
goto out;
@@ -1552,7 +1552,7 @@ out_free_pages:
}
kfree(pages);
out_free_scratch:
- __free_page(res.scratch);
+ folio_put(res.scratch);
out:
return ret;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 4cc915d5741d..e10d83ba835e 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
- xdr_set_scratch_page(xdr, res->scratch);
+ xdr_set_scratch_folio(xdr, res->scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c9a0d1e420c6..7f43e890d356 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = {
#else
.llseek = nfs_file_llseek,
#endif
+ .fop_flags = FOP_DONTCACHE,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ce61253efd45..f58098417142 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
*p++ = htonl(attrs); /* bitmap */
*p++ = htonl(12); /* attribute buffer length */
*p++ = htonl(NF4DIR);
+ spin_lock(&dentry->d_lock);
p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+ spin_unlock(&dentry->d_lock);
readdir->pgbase = (char *)p - (char *)start;
readdir->count -= readdir->pgbase;
@@ -6160,7 +6162,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
}
/* for decoding across pages */
- res.acl_scratch = alloc_page(GFP_KERNEL);
+ res.acl_scratch = folio_alloc(GFP_KERNEL, 0);
if (!res.acl_scratch)
goto out_free;
@@ -6196,7 +6198,7 @@ out_free:
while (--i >= 0)
__free_page(pages[i]);
if (res.acl_scratch)
- __free_page(res.acl_scratch);
+ folio_put(res.acl_scratch);
kfree(pages);
return ret;
}
@@ -7872,10 +7874,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
return err;
do {
err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
- if (err != -NFS4ERR_DELAY)
+ if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE)
break;
ssleep(1);
- } while (err == -NFS4ERR_DELAY);
+ } while (err == -NFS4ERR_DELAY || err == -NFS4ERR_GRACE);
return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
}
@@ -9442,7 +9444,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
goto out;
if (rcvd->max_rqst_sz > sent->max_rqst_sz)
return -EINVAL;
- if (rcvd->max_resp_sz < sent->max_resp_sz)
+ if (rcvd->max_resp_sz > sent->max_resp_sz)
return -EINVAL;
if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
return -EINVAL;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7612e977e80b..01179f7de322 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2744,6 +2744,9 @@ out_error:
case -ENETUNREACH:
nfs_mark_client_ready(clp, -EIO);
break;
+ case -EINVAL:
+ nfs_mark_client_ready(clp, status);
+ break;
default:
ssleep(1);
break;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index b29a26923ce0..5ec9c83f1ef0 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server,
struct fs_context *root_fc;
struct vfsmount *root_mnt;
struct dentry *dentry;
- size_t len;
+ char *source;
int ret;
- struct fs_parameter param = {
- .key = "source",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
- struct fs_parameter param_fsc = {
- .key = "fsc",
- .type = fs_value_is_string,
- .dirfd = -1,
- };
-
if (IS_ERR(server))
return PTR_ERR(server);
@@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server,
root_ctx->server = server;
if (ctx->fscache_uniq) {
- len = strlen(ctx->fscache_uniq);
- param_fsc.size = len;
- param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
- if (param_fsc.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
- ret = vfs_parse_fs_param(root_fc, &param_fsc);
- kfree(param_fsc.string);
+ ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
@@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server,
}
/* We leave export_path unset as it's not used to find the root. */
- len = strlen(hostname) + 5;
- param.string = kmalloc(len, GFP_KERNEL);
- if (param.string == NULL) {
- put_fs_context(root_fc);
- return -ENOMEM;
- }
-
/* Does the hostname need to be enclosed in brackets? */
if (strchr(hostname, ':'))
- param.size = snprintf(param.string, len, "[%s]:/", hostname);
+ source = kasprintf(GFP_KERNEL, "[%s]:/", hostname);
else
- param.size = snprintf(param.string, len, "%s:/", hostname);
- ret = vfs_parse_fs_param(root_fc, &param);
- kfree(param.string);
+ source = kasprintf(GFP_KERNEL, "%s:/", hostname);
+
+ if (!source) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+ ret = vfs_parse_fs_string(root_fc, "source", source);
+ kfree(source);
if (ret < 0) {
put_fs_context(root_fc);
return ret;
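
The hunk above replaces the open-coded fs_parameter plumbing with kasprintf() plus vfs_parse_fs_string(), preserving the rule that a hostname containing ':' (IPv6) gets bracketed. A userspace sketch of just that string construction, with asprintf() standing in for kasprintf():

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Mirror of the "source" string construction: [host]:/ for IPv6 hosts. */
    static char *build_source(const char *hostname)
    {
        char *source;
        int ret;

        if (strchr(hostname, ':'))
            ret = asprintf(&source, "[%s]:/", hostname);
        else
            ret = asprintf(&source, "%s:/", hostname);
        return ret < 0 ? NULL : source;
    }

    int main(void)
    {
        char *a = build_source("server.example.com");
        char *b = build_source("2001:db8::1");
        printf("%s\n%s\n", a, b); /* server.example.com:/ and [2001:db8::1]:/ */
        free(a);
        free(b);
        return 0;
    }
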
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 49ff98571fa5..1d0e6c10f921 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
}
/*
- * The prefered block size for layout directed io
+ * The preferred block size for layout directed io
*/
static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
uint32_t *res)
@@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
if (res->acl_scratch != NULL)
- xdr_set_scratch_page(xdr, res->acl_scratch);
+ xdr_set_scratch_folio(xdr, res->acl_scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 627115179795..6ce55e8e6b67 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -45,6 +45,23 @@
{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
{ BIT(NFS_INO_ODIRECT), "ODIRECT" })
+#define nfs_show_wb_flags(v) \
+ __print_flags(v, "|", \
+ { BIT(PG_BUSY), "BUSY" }, \
+ { BIT(PG_MAPPED), "MAPPED" }, \
+ { BIT(PG_FOLIO), "FOLIO" }, \
+ { BIT(PG_CLEAN), "CLEAN" }, \
+ { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \
+ { BIT(PG_INODE_REF), "INODE_REF" }, \
+ { BIT(PG_HEADLOCK), "HEADLOCK" }, \
+ { BIT(PG_TEARDOWN), "TEARDOWN" }, \
+ { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \
+ { BIT(PG_UPTODATE), "UPTODATE" }, \
+ { BIT(PG_WB_END), "WB_END" }, \
+ { BIT(PG_REMOVE), "REMOVE" }, \
+ { BIT(PG_CONTENDED1), "CONTENDED1" }, \
+ { BIT(PG_CONTENDED2), "CONTENDED2" })
+
DECLARE_EVENT_CLASS(nfs_inode_event,
TP_PROTO(
const struct inode *inode
@@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
+ __entry->offset = offset;
__entry->count = count;
),
@@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = offset,
- __entry->count = count,
+ __entry->offset = offset;
+ __entry->count = count;
__entry->ret = ret;
),
@@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done);
DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio);
DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done);
+DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_update_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_begin);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_write_end);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_writepages);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done);
+
+DECLARE_EVENT_CLASS(nfs_kiocb_event,
+ TP_PROTO(
+ const struct kiocb *iocb,
+ const struct iov_iter *iter
+ ),
+
+ TP_ARGS(iocb, iter),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
+ __entry->flags = iocb->ki_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->count,
+ __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS)
+ )
+);
+
+#define DEFINE_NFS_KIOCB_EVENT(name) \
+ DEFINE_EVENT(nfs_kiocb_event, name, \
+ TP_PROTO( \
+ const struct kiocb *iocb, \
+ const struct iov_iter *iter \
+ ), \
+ TP_ARGS(iocb, iter))
+
+DEFINE_NFS_KIOCB_EVENT(nfs_file_read);
+DEFINE_NFS_KIOCB_EVENT(nfs_file_write);
+
TRACE_EVENT(nfs_aop_readahead,
TP_PROTO(
const struct inode *inode,
@@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done,
)
);
+DECLARE_EVENT_CLASS(nfs_page_class,
+ TP_PROTO(
+ const struct nfs_page *req
+ ),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(const struct nfs_page *__private, req)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
+ __field(unsigned long, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = folio_inode(req->wb_folio);
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->req = req;
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
+ __entry->flags = req->wb_flags;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->req, __entry->offset, __entry->count,
+ nfs_show_wb_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_PAGE_EVENT(name) \
+ DEFINE_EVENT(nfs_page_class, name, \
+ TP_PROTO( \
+ const struct nfs_page *req \
+ ), \
+ TP_ARGS(req))
+
+DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup);
+DEFINE_NFS_PAGE_EVENT(nfs_do_writepage);
+
DECLARE_EVENT_CLASS(nfs_page_error_class,
TP_PROTO(
const struct inode *inode,
@@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+DECLARE_EVENT_CLASS(nfs_local_dio_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t offset,
+ ssize_t count,
+ const struct nfs_local_dio *local_dio
+ ),
+ TP_ARGS(inode, offset, count, local_dio),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(loff_t, start)
+ __field(ssize_t, start_len)
+ __field(loff_t, middle)
+ __field(ssize_t, middle_len)
+ __field(loff_t, end)
+ __field(ssize_t, end_len)
+ ),
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->mem_align = local_dio->mem_align;
+ __entry->offset_align = local_dio->offset_align;
+ __entry->start = offset;
+ __entry->start_len = local_dio->start_len;
+ __entry->middle = local_dio->middle_offset;
+ __entry->middle_len = local_dio->middle_len;
+ __entry->end = local_dio->end_offset;
+ __entry->end_len = local_dio->end_len;
+ ),
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd "
+ "mem_align=%u offset_align=%u "
+ "start=%llu+%zd middle=%llu+%zd end=%llu+%zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset, __entry->count,
+ __entry->mem_align, __entry->offset_align,
+ __entry->start, __entry->start_len,
+ __entry->middle, __entry->middle_len,
+ __entry->end, __entry->end_len)
+);
+
+#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \
+DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \
+ TP_PROTO(const struct inode *inode, \
+ loff_t offset, \
+ ssize_t count, \
+ const struct nfs_local_dio *local_dio),\
+ TP_ARGS(inode, offset, count, local_dio))
+
+DEFINE_NFS_LOCAL_DIO_EVENT(read);
+DEFINE_NFS_LOCAL_DIO_EVENT(write);
+DEFINE_NFS_LOCAL_DIO_EVENT(misaligned);
+
+#endif /* CONFIG_NFS_LOCALIO */
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
@@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh,
),
TP_printk(
- "error=%d fhandle=0x%08x mode=%s",
- __entry->error,
+ "fhandle=0x%08x mode=%s result=%d",
__entry->fhandle,
- show_fs_fmode_flags(__entry->fmode)
+ show_fs_fmode_flags(__entry->fmode),
+ __entry->error
)
);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 647c53d1418a..0fb6905736d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio)
{
struct nfs_server *nfss = NFS_SERVER(folio->mapping->host);
- folio_end_writeback(folio);
+ folio_end_writeback_no_dropbehind(folio);
if (atomic_long_dec_return(&nfss->writeback) <
NFS_CONGESTION_OFF_THRESH) {
nfss->write_congested = 0;
@@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_do_writepage(req);
nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
@@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start);
+
/* Wait with writeback until write congestion eases */
if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) {
err = wait_event_killable(nfss->write_congestion_wait,
nfss->write_congested == 0);
if (err)
- return err;
+ goto out_err;
}
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
@@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
- if (err < 0)
- goto out_err;
- return 0;
+ if (err > 0)
+ err = 0;
out_err:
+ trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err);
return err;
}
@@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
spin_unlock(&mapping->i_private_lock);
+
+ folio_end_dropbehind(folio);
}
nfs_page_group_unlock(req);
@@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
req->wb_nio = 0;
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo,
- hdr->pgio_mirror_idx);
+ hdr->ds_commit_idx);
goto next;
}
remove_req:
@@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
unsigned int end;
int error;
+ trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes);
end = offset + bytes;
req = nfs_lock_and_join_requests(folio);
if (IS_ERR_OR_NULL(req))
- return req;
+ goto out;
rqend = req->wb_offset + req->wb_bytes;
/*
@@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
else
req->wb_bytes = rqend - req->wb_offset;
req->wb_nio = 0;
+out:
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes,
+ PTR_ERR_OR_ZERO(req));
return req;
out_flushme:
/*
@@ -1053,6 +1062,7 @@ out_flushme:
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
error = nfs_wb_folio(folio->mapping->host, folio);
+ trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error);
return (error < 0) ? ERR_PTR(error) : NULL;
}
@@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx,
req = nfs_setup_write_request(ctx, folio, offset, count);
if (IS_ERR(req))
return PTR_ERR(req);
+ trace_nfs_writepage_setup(req);
/* Update file length */
nfs_grow_file(folio, offset, count);
nfs_mark_uptodate(req);
@@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+ trace_nfs_update_folio(inode, offset, count);
+
dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
(long long)(folio_pos(folio) + offset));
@@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio,
if (status < 0)
nfs_set_pageerror(mapping);
out:
+ trace_nfs_update_folio_done(inode, offset, count, status);
dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1806,7 +1820,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
nfs_mapping_set_error(folio, status);
nfs_inode_remove_request(req);
}
- dprintk_cont(", error = %d\n", status);
+ dprintk(", error = %d\n", status);
goto next;
}
@@ -1816,11 +1830,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a match */
if (folio)
nfs_inode_remove_request(req);
- dprintk_cont(" OK\n");
+ dprintk(" OK\n");
goto next;
}
/* We have a mismatch. Write the page again */
- dprintk_cont(" mismatch\n");
+ dprintk(" mismatch\n");
nfs_mark_request_dirty(req);
atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cadfc2bae60e..caa695c06efb 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -402,7 +402,7 @@ static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
-static int check_export(struct path *path, int *flags, unsigned char *uuid)
+static int check_export(const struct path *path, int *flags, unsigned char *uuid)
{
struct inode *inode = d_inode(path->dentry);
@@ -1181,7 +1181,7 @@ denied:
* use exp_get_by_name() or exp_find().
*/
struct svc_export *
-rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
+rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path)
{
struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index b9c0adb3ce09..cb36e6cce829 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -111,7 +111,7 @@ int nfsd_export_init(struct net *);
void nfsd_export_shutdown(struct net *);
void nfsd_export_flush(struct net *);
struct svc_export * rqst_exp_get_by_name(struct svc_rqst *,
- struct path *);
+ const struct path *);
struct svc_export * rqst_exp_parent(struct svc_rqst *,
struct path *);
struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 85ca663c052c..e010d90aeb27 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
refcount_set(&nf->nf_ref, 1);
nf->nf_may = need;
nf->nf_mark = NULL;
+ nf->nf_dio_mem_align = 0;
+ nf->nf_dio_offset_align = 0;
+ nf->nf_dio_read_offset_align = 0;
return nf;
}
@@ -1070,6 +1073,35 @@ nfsd_file_is_cached(struct inode *inode)
}
static __be32
+nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf)
+{
+ struct inode *inode = file_inode(nf->nf_file);
+ struct kstat stat;
+ __be32 status;
+
+ /* Currently only need to get DIO alignment info for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return nfs_ok;
+
+ status = fh_getattr(fhp, &stat);
+ if (status != nfs_ok)
+ return status;
+
+ trace_nfsd_file_get_dio_attrs(inode, &stat);
+
+ if (stat.result_mask & STATX_DIOALIGN) {
+ nf->nf_dio_mem_align = stat.dio_mem_align;
+ nf->nf_dio_offset_align = stat.dio_offset_align;
+ }
+ if (stat.result_mask & STATX_DIO_READ_ALIGN)
+ nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
+ else
+ nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
+
+ return nfs_ok;
+}
+
+static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
struct svc_cred *cred,
struct auth_domain *client,
@@ -1187,6 +1219,8 @@ open_file:
}
status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
+ if (status == nfs_ok)
+ status = nfsd_file_get_dio_attrs(fhp, nf);
}
} else
status = nfserr_jukebox;
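
nfsd_file_get_dio_attrs() caches the DIO alignment attributes that fh_getattr() now requests (the request_mask change is in the fs/nfsd/vfs.h hunk further down). The same values are visible to userspace through statx(2); a sketch that queries them for a regular file (STATX_DIOALIGN needs v6.1+ uapi headers; STATX_DIO_READ_ALIGN is newer still and omitted here):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 ||
            statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) < 0) {
            perror("statx");
            return 1;
        }
        if (stx.stx_mask & STATX_DIOALIGN)
            printf("dio mem_align=%u offset_align=%u\n",
                   stx.stx_dio_mem_align, stx.stx_dio_offset_align);
        else
            printf("filesystem reports no DIO alignment info\n");
        return 0;
    }
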
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 722b26c71e45..237a05c74211 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -54,6 +54,10 @@ struct nfsd_file {
struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
+
+ u32 nf_dio_mem_align;
+ u32 nf_dio_offset_align;
+ u32 nf_dio_read_offset_align;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index cb237f1b902a..9e0a37cd29d8 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
return localio;
}
+static void nfsd_file_dio_alignment(struct nfsd_file *nf,
+ u32 *nf_dio_mem_align,
+ u32 *nf_dio_offset_align,
+ u32 *nf_dio_read_offset_align)
+{
+ *nf_dio_mem_align = nf->nf_dio_mem_align;
+ *nf_dio_offset_align = nf->nf_dio_offset_align;
+ *nf_dio_read_offset_align = nf->nf_dio_read_offset_align;
+}
+
static const struct nfsd_localio_operations nfsd_localio_ops = {
.nfsd_net_try_get = nfsd_net_try_get,
.nfsd_net_put = nfsd_net_put,
@@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = {
.nfsd_file_put_local = nfsd_file_put_local,
.nfsd_file_get_local = nfsd_file_get_local,
.nfsd_file_file = nfsd_file_file,
+ .nfsd_file_dio_alignment = nfsd_file_dio_alignment,
};
void nfsd_localio_ops_init(void)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc6b776fc657..2b79129703d5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1103,89 +1103,48 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
* populating the filesystem.
*/
-/* Basically copying rpc_get_inode. */
static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
- if (!inode)
- return NULL;
- /* Following advice from simple_fill_super documentation: */
- inode->i_ino = iunique(sb, NFSD_MaxReserved);
- inode->i_mode = mode;
- simple_inode_init_ts(inode);
- switch (mode & S_IFMT) {
- case S_IFDIR:
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &simple_dir_inode_operations;
- inc_nlink(inode);
- break;
- case S_IFLNK:
- inode->i_op = &simple_symlink_inode_operations;
- break;
- default:
- break;
+ if (inode) {
+ /* Following advice from simple_fill_super documentation: */
+ inode->i_ino = iunique(sb, NFSD_MaxReserved);
+ inode->i_mode = mode;
+ simple_inode_init_ts(inode);
}
return inode;
}
-static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl)
+static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
{
+ struct inode *dir = parent->d_inode;
+ struct dentry *dentry;
struct inode *inode;
- inode = nfsd_get_inode(dir->i_sb, mode);
+ inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return dentry;
+ }
+ inode->i_fop = &simple_dir_operations;
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
if (ncl) {
inode->i_private = ncl;
kref_get(&ncl->cl_ref);
}
- d_add(dentry, inode);
+ d_instantiate(dentry, inode);
inc_nlink(dir);
fsnotify_mkdir(dir, dentry);
- return 0;
-}
-
-static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
-{
- struct inode *dir = parent->d_inode;
- struct dentry *dentry;
- int ret = -ENOMEM;
-
- inode_lock(dir);
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- goto out_err;
- ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl);
- if (ret)
- goto out_err;
-out:
inode_unlock(dir);
return dentry;
-out_err:
- dput(dentry);
- dentry = ERR_PTR(ret);
- goto out;
}
#if IS_ENABLED(CONFIG_SUNRPC_GSS)
-static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
- umode_t mode, const char *content)
-{
- struct inode *inode;
-
- inode = nfsd_get_inode(dir->i_sb, mode);
- if (!inode)
- return -ENOMEM;
-
- inode->i_link = (char *)content;
- inode->i_size = strlen(content);
-
- d_add(dentry, inode);
- inc_nlink(dir);
- fsnotify_create(dir, dentry);
- return 0;
-}
-
/*
* @content is assumed to be a NUL-terminated string that lives
* longer than the symlink itself.
@@ -1194,17 +1153,25 @@ static void _nfsd_symlink(struct dentry *parent, const char *name,
const char *content)
{
struct inode *dir = parent->d_inode;
+ struct inode *inode;
struct dentry *dentry;
- int ret;
- inode_lock(dir);
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- goto out;
- ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content);
- if (ret)
- dput(dentry);
-out:
+ inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777);
+ if (!inode)
+ return;
+
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return;
+ }
+
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = (char *)content;
+ inode->i_size = strlen(content);
+
+ d_instantiate(dentry, inode);
+ fsnotify_create(dir, dentry);
inode_unlock(dir);
}
#else
@@ -1240,40 +1207,34 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
* code instead. */
-static int nfsdfs_create_files(struct dentry *root,
+static int nfsdfs_create_files(struct dentry *root,
const struct tree_descr *files,
struct nfsdfs_client *ncl,
struct dentry **fdentries)
{
struct inode *dir = d_inode(root);
- struct inode *inode;
struct dentry *dentry;
- int i;
- inode_lock(dir);
- for (i = 0; files->name && files->name[0]; i++, files++) {
- dentry = d_alloc_name(root, files->name);
- if (!dentry)
- goto out;
- inode = nfsd_get_inode(d_inode(root)->i_sb,
- S_IFREG | files->mode);
- if (!inode) {
- dput(dentry);
- goto out;
+ for (int i = 0; files->name && files->name[0]; i++, files++) {
+ struct inode *inode = nfsd_get_inode(root->d_sb,
+ S_IFREG | files->mode);
+ if (!inode)
+ return -ENOMEM;
+ dentry = simple_start_creating(root, files->name);
+ if (IS_ERR(dentry)) {
+ iput(inode);
+ return PTR_ERR(dentry);
}
kref_get(&ncl->cl_ref);
inode->i_fop = files->ops;
inode->i_private = ncl;
- d_add(dentry, inode);
+ d_instantiate(dentry, inode);
fsnotify_create(dir, dentry);
if (fdentries)
fdentries[i] = dentry;
+ inode_unlock(dir);
}
- inode_unlock(dir);
return 0;
-out:
- inode_unlock(dir);
- return -ENOMEM;
}
/* on success, returns positive number unique to that client. */
@@ -1993,7 +1954,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
* remaining listeners and recreate the list.
*/
if (delete)
- svc_xprt_destroy_all(serv, net);
+ svc_xprt_destroy_all(serv, net, false);
/* walk list of addrs again, open any that still don't exist */
nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 82b0111ac469..7057ddd7a0a8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net)
#endif
}
- svc_xprt_destroy_all(serv, net);
-
/*
* write_ports can create the server without actually starting
- * any threads--if we get shut down before any threads are
+ * any threads. If we get shut down before any threads are
* started, then nfsd_destroy_serv will be run before any of this
* other initialization has been done except the rpcb information.
*/
- svc_rpcb_cleanup(serv, net);
-
+ svc_xprt_destroy_all(serv, net, true);
nfsd_shutdown_net(net);
svc_destroy(&serv);
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a664fdf1161e..6e2c8e2aab10 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc,
)
);
+TRACE_EVENT(nfsd_file_get_dio_attrs,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct kstat *stat
+ ),
+ TP_ARGS(inode, stat),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ __field(unsigned long, mask)
+ __field(u32, mem_align)
+ __field(u32, offset_align)
+ __field(u32, read_offset_align)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->mask = stat->result_mask;
+ __entry->mem_align = stat->dio_mem_align;
+ __entry->offset_align = stat->dio_offset_align;
+ __entry->read_offset_align = stat->dio_read_offset_align;
+ ),
+ TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u",
+ __entry->inode, show_statx_mask(__entry->mask),
+ __entry->mem_align, __entry->offset_align,
+ __entry->read_offset_align
+ )
+);
+
TRACE_EVENT(nfsd_file_acquire,
TP_PROTO(
const struct svc_rqst *rqstp,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index eff04959606f..fde3e0c11dba 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
u32 request_mask = STATX_BASIC_STATS;
struct path p = {.mnt = fh->fh_export->ex_path.mnt,
.dentry = fh->fh_dentry};
+ struct inode *inode = d_inode(p.dentry);
+
+ if (S_ISREG(inode->i_mode))
+ request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN);
if (fh->fh_maxsize == NFS4_FHSIZE)
request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index b78308975082..39e60218df7c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -441,7 +441,9 @@ struct fanotify_perm_event {
size_t count;
u32 response; /* userspace answer to the event */
unsigned short state; /* state of the event */
+ unsigned short watchdog_cnt; /* watchdog scans seen: 0, 1 or 2 */
int fd; /* fd we passed to userspace for this event */
+ pid_t recv_pid; /* pid of task receiving the event */
union {
struct fanotify_response_info_header hdr;
struct fanotify_response_info_audit_rule audit_rule;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b192ee068a7a..1dadda82cae5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -50,6 +50,7 @@
/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;
+static int perm_group_timeout __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
+ {
+ .procname = "watchdog_timeout",
+ .data = &perm_group_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
};
static void __init fanotify_sysctls_init(void)
@@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void)
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
+static LIST_HEAD(perm_group_list);
+static DEFINE_SPINLOCK(perm_group_lock);
+static void perm_group_watchdog(struct work_struct *work);
+static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog);
+
+static void perm_group_watchdog_schedule(void)
+{
+ schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout));
+}
+
+static void perm_group_watchdog(struct work_struct *work)
+{
+ struct fsnotify_group *group;
+ struct fanotify_perm_event *event;
+ struct task_struct *task;
+ pid_t failed_pid = 0;
+
+ guard(spinlock)(&perm_group_lock);
+ if (list_empty(&perm_group_list))
+ return;
+
+ list_for_each_entry(group, &perm_group_list,
+ fanotify_data.perm_grp_list) {
+ /*
+ * OK to test without the lock; racing with an addition is
+ * fine, we will deal with it on the next round
+ */
+ if (list_empty(&group->fanotify_data.access_list))
+ continue;
+
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (likely(event->watchdog_cnt == 0)) {
+ event->watchdog_cnt = 1;
+ } else if (event->watchdog_cnt == 1) {
+ /* Report on event only once */
+ event->watchdog_cnt = 2;
+
+ /* Do not report same pid repeatedly */
+ if (event->recv_pid == failed_pid)
+ continue;
+
+ failed_pid = event->recv_pid;
+ rcu_read_lock();
+ task = find_task_by_pid_ns(event->recv_pid,
+ &init_pid_ns);
+ pr_warn_ratelimited(
+ "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n",
+ event->recv_pid,
+ task ? task->comm : NULL,
+ perm_group_timeout);
+ rcu_read_unlock();
+ }
+ }
+ spin_unlock(&group->notification_lock);
+ }
+ perm_group_watchdog_schedule();
+}
+
+static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group)
+{
+ if (!list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Perm event watchdog can no longer scan this group. */
+ spin_lock(&perm_group_lock);
+ list_del_init(&group->fanotify_data.perm_grp_list);
+ spin_unlock(&perm_group_lock);
+ }
+}
+
+static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group)
+{
+ if (!perm_group_timeout)
+ return;
+
+ spin_lock(&perm_group_lock);
+ if (list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Add to perm_group_list for monitoring by watchdog. */
+ if (list_empty(&perm_group_list))
+ perm_group_watchdog_schedule();
+ list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list);
+ }
+ spin_unlock(&perm_group_lock);
+}
+
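
Note that fanotify_perm_watchdog_group_add() is a no-op while perm_group_timeout is zero, so the watchdog is strictly opt-in. A minimal sketch that turns it on with a 30-second timeout by writing the sysctl registered above (requires root):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/fs/fanotify/watchdog_timeout", "w");

        if (!f) {
            perror("fopen");
            return 1;
        }
        fprintf(f, "30\n"); /* warn when a perm event sits unanswered ~30s */
        fclose(f);
        return 0;
    }
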
/*
* All flags that may be specified in parameter event_f_flags of fanotify_init.
*
@@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
spin_lock(&group->notification_lock);
list_add_tail(&event->fse.list,
&group->fanotify_data.access_list);
+ FANOTIFY_PERM(event)->recv_pid = current->pid;
spin_unlock(&group->notification_lock);
}
}
@@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
fsnotify_group_stop_queueing(group);
+ fanotify_perm_watchdog_group_remove(group);
+
/*
* Process all permission events on access_list and notification queue
* and simulate reply from userspace.
@@ -1465,6 +1562,10 @@ out:
fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
+
+ if (!ret && (mask & FANOTIFY_PERM_EVENTS))
+ fanotify_perm_watchdog_group_add(group);
+
return ret;
}
@@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.f_flags = event_f_flags;
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
+ INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list);
switch (class) {
case FAN_CLASS_NOTIF:
group->priority = FSNOTIFY_PRIO_NORMAL;
@@ -1999,7 +2101,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt->mnt_sb;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ ret = -EINVAL;
mntns = mnt_ns_from_dentry(path.dentry);
+ if (!mntns)
+ goto path_put_and_out;
user_ns = mntns->user_ns;
obj = mntns;
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index cd7d11b0eb08..7c326ec2e8a8 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -10,7 +10,7 @@
* Copyright 2006 Hewlett-Packard Development Company, L.P.
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
- * inotify was largely rewriten to make use of the fsnotify infrastructure
+ * inotify was largely rewritten to make use of the fsnotify infrastructure
*/
#include <linux/dcache.h> /* d_unlinked */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e7fd8a790aaa..648dc59bef7f 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -571,7 +571,7 @@ static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
-static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+static struct file *nsfs_export_open(const struct path *path, unsigned int oflags)
{
return file_open_root(path, "", oflags, 0);
}
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 04107b950717..65d05e6a0566 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -1371,6 +1371,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
mark_buffer_dirty(bh);
unlock_buffer(bh);
/* err = sync_dirty_buffer(bh); */
+ put_bh(bh);
b0 = 0;
bits -= op;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index c1ece707b195..4c90ec2fa2ea 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -49,6 +49,30 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
return 0;
}
+static int ntfs_ioctl_get_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf)
+{
+ if (copy_to_user(buf, sbi->volume.label, FSLABEL_MAX))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ntfs_ioctl_set_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf)
+{
+ u8 user[FSLABEL_MAX] = {0};
+ int len;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(user, buf, FSLABEL_MAX))
+ return -EFAULT;
+
+ len = strnlen(user, FSLABEL_MAX);
+
+ return ntfs_set_label(sbi, user, len);
+}
+
/*
* ntfs_ioctl - file_operations::unlocked_ioctl
*/
@@ -64,6 +88,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
switch (cmd) {
case FITRIM:
return ntfs_ioctl_fitrim(sbi, arg);
+ case FS_IOC_GETFSLABEL:
+ return ntfs_ioctl_get_volume_label(sbi, (u8 __user *)arg);
+ case FS_IOC_SETFSLABEL:
+ return ntfs_ioctl_set_volume_label(sbi, (u8 __user *)arg);
}
return -ENOTTY; /* Inappropriate ioctl for device. */
}
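
FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL are the generic label ioctls already implemented by ext4, btrfs and xfs, so existing tools gain ntfs3 support for free. A userspace sketch exercising both on any file within the mount (FSLABEL_MAX comes from <linux/fs.h>; setting requires CAP_SYS_ADMIN, matching the capable() check above):

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
        char label[FSLABEL_MAX] = "";
        int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

        if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
            perror("FS_IOC_GETFSLABEL");
            return 1;
        }
        printf("label: %s\n", label);

        if (argc > 2) { /* optionally set a new label */
            memset(label, 0, sizeof(label));
            strncpy(label, argv[2], FSLABEL_MAX - 1);
            if (ioctl(fd, FS_IOC_SETFSLABEL, label) < 0)
                perror("FS_IOC_SETFSLABEL");
        }
        close(fd);
        return 0;
    }
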
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 1bf2a6593dec..6d1bf890929d 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1508,6 +1508,16 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size);
}
+ /*
+ * Index blocks exist, but $BITMAP has zero valid bits.
+ * This implies an on-disk corruption and must be rejected.
+ */
+ if (in->name == I30_NAME &&
+ unlikely(bmp_size_v == 0 && indx->alloc_run.count)) {
+ err = -EINVAL;
+ goto out1;
+ }
+
bit = bmp_size << 3;
}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 48b4f73a93ee..3959f23c487a 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -471,6 +471,7 @@ end_enum:
fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
/* Records in $Extend are not a files or general directories. */
inode->i_op = &ntfs_file_inode_operations;
+ mode = S_IFREG;
} else {
err = -EINVAL;
goto out;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 1296e6fcc779..630128716ea7 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -280,7 +280,7 @@ struct ntfs_sb_info {
__le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY.
u8 major_ver;
u8 minor_ver;
- char label[256];
+ char label[FSLABEL_MAX];
bool real_dirty; // Real fs state.
} volume;
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index 6e86d66197ef..88550085f745 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/log2.h>
+#include <linux/overflow.h>
#include "debug.h"
#include "ntfs.h"
@@ -982,14 +983,18 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
if (!dlcn)
return -EINVAL;
- lcn = prev_lcn + dlcn;
+
+ if (check_add_overflow(prev_lcn, dlcn, &lcn))
+ return -EINVAL;
prev_lcn = lcn;
} else {
/* The size of 'dlcn' can't be > 8. */
return -EINVAL;
}
- next_vcn = vcn64 + len;
+ if (check_add_overflow(vcn64, len, &next_vcn))
+ return -EINVAL;
+
/* Check boundary. */
if (next_vcn > evcn + 1)
return -EINVAL;
@@ -1153,7 +1158,8 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
return -EINVAL;
run_buf += size_size + offset_size;
- vcn64 += len;
+ if (check_add_overflow(vcn64, len, &vcn64))
+ return -EINVAL;
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
if (vcn64 > 0x100000000ull)
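
check_add_overflow() from <linux/overflow.h> is a typed wrapper around the compiler builtin, so malformed run lists whose VCN/LCN deltas would wrap are now rejected instead of silently truncated. Its semantics, demonstrated in userspace with the underlying __builtin_add_overflow() (gcc/clang):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t vcn64 = UINT64_MAX - 4, len = 10, next_vcn;

        /* returns true (and stores the wrapped value) when the sum
         * does not fit in the destination type */
        if (__builtin_add_overflow(vcn64, len, &next_vcn))
            printf("wrapped: reject with -EINVAL\n");

        vcn64 = 100;
        if (!__builtin_add_overflow(vcn64, len, &next_vcn))
            printf("next_vcn = %llu\n", (unsigned long long)next_vcn);
        return 0;
    }
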
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 821cb7874685..162711cc5b20 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6928,8 +6928,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
out:
if (ret != 0) {
- if (folios)
- ocfs2_unlock_and_free_folios(folios, numfolios);
+ ocfs2_unlock_and_free_folios(folios, numfolios);
numfolios = 0;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 86bb1a03bcc1..4145e06d2c08 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1477,7 +1477,6 @@ way_up_top:
goto send_response;
} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
spin_unlock(&res->spinlock);
- // mlog(0, "node %u is the master\n", res->owner);
response = DLM_MASTER_RESP_NO;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
@@ -1493,7 +1492,6 @@ way_up_top:
BUG();
}
- // mlog(0, "lockres is in progress...\n");
spin_lock(&dlm->master_lock);
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
@@ -1503,8 +1501,6 @@ way_up_top:
set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->type == DLM_MLE_BLOCK) {
- // mlog(0, "this node is waiting for "
- // "lockres to be mastered\n");
response = DLM_MASTER_RESP_NO;
} else if (tmpmle->type == DLM_MLE_MIGRATION) {
mlog(0, "node %u is master, but trying to migrate to "
@@ -1531,8 +1527,6 @@ way_up_top:
} else
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "this node is attempting to "
- // "master lockres\n");
response = DLM_MASTER_RESP_MAYBE;
}
if (set_maybe)
@@ -1559,7 +1553,6 @@ way_up_top:
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
/* this lockid has never been seen on this node yet */
- // mlog(0, "no mle found\n");
if (!mle) {
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -1573,8 +1566,6 @@ way_up_top:
goto way_up_top;
}
- // mlog(0, "this is second time thru, already allocated, "
- // "add the block.\n");
dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
@@ -1897,8 +1888,6 @@ ok:
spin_unlock(&res->spinlock);
}
- // mlog(0, "woo! got an assert_master from node %u!\n",
- // assert->node_idx);
if (mle) {
int extra_ref = 0;
int nn = -1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00f52812dbb0..843ee02bd85f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
- // mlog(0, "nothing to recover! sleeping now!\n");
spin_unlock(&dlm->spinlock);
/* return to main thread loop and sleep. */
return 0;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 6c4f78f473fb..fcc89856ab95 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1495,6 +1495,14 @@ int ocfs2_validate_inode_block(struct super_block *sb,
goto bail;
}
+ if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+ (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) {
+ rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(di->i_suballoc_slot));
+ goto bail;
+ }
+
rc = 0;
bail:
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index db14c92302a1..b6864602814c 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -358,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, i);
+ int len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
@@ -651,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
goto bail;
}
} else {
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
- OCFS2_INVALID_SLOT);
+ int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf),
+ type, OCFS2_INVALID_SLOT);
status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
- namebuf,
- strlen(namebuf),
- &blkno);
+ namebuf, len, &blkno);
if (status < 0) {
status = -ENOENT;
goto bail;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index cbe2f8ed8897..86f2631e6360 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -364,7 +364,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
int *vict_bit,
struct buffer_head **ret_bh)
{
- int ret, i, bits_per_unit = 0;
+ int ret, i, len, bits_per_unit = 0;
u64 blkno;
char namebuf[40];
@@ -375,9 +375,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
struct ocfs2_dinode *ac_dinode;
struct ocfs2_group_desc *bg;
- ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
- ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno);
+
if (ret) {
ret = -ENOENT;
goto out;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e8e94599e907..ae0e44e5f2ad 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -614,7 +614,7 @@ struct ocfs2_super_block {
__le16 s_reserved0;
__le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
* s_uuid_hash serves as seed[3]. */
-/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
+/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */
/*140*/
/*
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 765105f1ff8a..be0a5758bd40 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
printk(KERN_ERR "ocfs2: Could not determine"
" locking version\n");
user_cluster_disconnect(conn);
+ lc = NULL;
goto out;
}
wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 53a945da873b..d53a6cc866be 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
char namebuf[40];
struct inode *inode = NULL;
u64 blkno;
- int status = 0;
+ int len, status = 0;
- ocfs2_sprintf_system_inode_name(namebuf,
- sizeof(namebuf),
- type, slot);
+ len = ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, slot);
- status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
- strlen(namebuf), &blkno);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf, len, &blkno);
if (status < 0) {
goto bail;
}
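
With ocfs2_sprintf_system_inode_name() returning the formatted length, all three call sites shed a strlen() pass over a string they just built. The underlying idiom is simply reusing the snprintf()-family return value; a standalone sketch (names illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char namebuf[40];

        /* for output that fits the buffer, snprintf() returns the
         * number of characters written, excluding the trailing NUL */
        int len = snprintf(namebuf, sizeof(namebuf), "%s:%04d",
                           "global_bitmap", 7);

        printf("len=%d strlen=%zu\n", len, strlen(namebuf)); /* 18 18 */
        return 0;
    }
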
diff --git a/fs/open.c b/fs/open.c
index 9655158c3885..3d64372ecc67 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1022,8 +1022,8 @@ cleanup_all:
put_file_access(f);
cleanup_file:
path_put(&f->f_path);
- f->f_path.mnt = NULL;
- f->f_path.dentry = NULL;
+ f->__f_path.mnt = NULL;
+ f->__f_path.dentry = NULL;
f->f_inode = NULL;
return error;
}
@@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry,
{
BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
- file->f_path.dentry = dentry;
+ file->__f_path.dentry = dentry;
return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);
@@ -1059,19 +1059,21 @@ EXPORT_SYMBOL(finish_open);
* finish_no_open - finish ->atomic_open() without opening the file
*
* @file: file pointer
- * @dentry: dentry or NULL (as returned from ->lookup())
+ * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup())
*
- * This can be used to set the result of a successful lookup in ->atomic_open().
+ * This can be used to set the result of a lookup in ->atomic_open().
*
* NB: unlike finish_open() this function does consume the dentry reference and
* the caller need not dput() it.
*
- * Returns "0" which must be the return value of ->atomic_open() after having
- * called this function.
+ * Returns 0 or -E..., which must be the return value of ->atomic_open() after
+ * having called this function.
*/
int finish_no_open(struct file *file, struct dentry *dentry)
{
- file->f_path.dentry = dentry;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ file->__f_path.dentry = dentry;
return 0;
}
EXPORT_SYMBOL(finish_no_open);
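
The relaxed contract lets a filesystem's ->atomic_open() forward whatever its lookup produced, NULL, a real dentry, or an ERR_PTR, without a separate IS_ERR() branch. A schematic sketch (hypothetical myfs_* names, kernel context assumed, not standalone-buildable):

    static int myfs_atomic_open(struct inode *dir, struct dentry *dentry,
                                struct file *file, unsigned open_flags,
                                umode_t mode)
    {
        struct dentry *res;

        /* ... attempt the combined create+open fast path first ... */

        /* fall back to a plain lookup; may yield NULL, a dentry,
         * or ERR_PTR(-E...) */
        res = myfs_lookup(dir, dentry, 0);

        /* finish_no_open() consumes @res and now maps an ERR_PTR
         * straight to the error return of ->atomic_open() */
        return finish_no_open(file, res);
    }
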
@@ -1091,7 +1093,7 @@ int vfs_open(const struct path *path, struct file *file)
{
int ret;
- file->f_path = *path;
+ file->__f_path = *path;
ret = do_dentry_open(file, NULL);
if (!ret) {
/*
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 82395fe2b956..bec5475de094 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -38,8 +38,7 @@ static int orangefs_create(struct mnt_idmap *idmap,
new_op->upcall.req.create.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.create.attributes,
- ORANGEFS_TYPE_METAFILE, mode);
+ fill_default_sys_attrs(new_op->upcall.req.create.attributes, mode);
strscpy(new_op->upcall.req.create.d_name, dentry->d_name.name);
@@ -240,9 +239,7 @@ static int orangefs_symlink(struct mnt_idmap *idmap,
new_op->upcall.req.sym.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
- ORANGEFS_TYPE_SYMLINK,
- mode);
+ fill_default_sys_attrs(new_op->upcall.req.sym.attributes, mode);
strscpy(new_op->upcall.req.sym.entry_name, dentry->d_name.name);
strscpy(new_op->upcall.req.sym.target, symname);
@@ -316,8 +313,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
new_op->upcall.req.mkdir.parent_refn = parent->refn;
- fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
- ORANGEFS_TYPE_DIRECTORY, mode);
+ fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, mode);
strscpy(new_op->upcall.req.mkdir.d_name, dentry->d_name.name);
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 1c375fb65018..79267b3419f2 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,14 +440,13 @@ static ssize_t orangefs_debug_write(struct file *file,
count = ORANGEFS_MAX_DEBUG_STRING_LEN;
}
- buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!buf)
- goto out;
-
- if (copy_from_user(buf, ubuf, count - 1)) {
+ buf = memdup_user_nul(ubuf, count - 1);
+ if (IS_ERR(buf)) {
gossip_debug(GOSSIP_DEBUGFS_DEBUG,
- "%s: copy_from_user failed!\n",
+ "%s: memdup_user_nul failed!\n",
__func__);
+ rc = PTR_ERR(buf);
+ buf = NULL;
goto out;
}
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 3e153c2f6b82..29c6da43e396 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -462,7 +462,7 @@ int service_operation(struct orangefs_kernel_op_s *op,
((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
ORANGEFS_OP_INTERRUPTIBLE : 0)
-#define fill_default_sys_attrs(sys_attr, type, mode) \
+#define fill_default_sys_attrs(sys_attr, mode) \
do { \
sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \
sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74ef75586f38..eee3c5ed1bbb 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -54,7 +54,9 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags)
static unsigned int xattr_key(const char *key)
{
unsigned int i = 0;
- while (key)
+ if (!key)
+ return 0;
+ while (*key)
i += *key++;
return i % 16;
}
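
With the old `while (key)` test the loop condition was the pointer itself, so any non-NULL key kept the loop running past the terminating NUL. The fixed function drops into a userspace harness unchanged:

    #include <stdio.h>

    /* sum of bytes, folded into 16 hash buckets; 0 for a NULL key */
    static unsigned int xattr_key(const char *key)
    {
        unsigned int i = 0;

        if (!key)
            return 0;
        while (*key)        /* old code tested `key`, never terminating */
            i += *key++;
        return i % 16;
    }

    int main(void)
    {
        printf("%u %u %u\n", xattr_key("user.foo"),
               xattr_key("security.selinux"), xattr_key(NULL));
        return 0;
    }
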
@@ -175,8 +177,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
cx->length = -1;
cx->timeout = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
- hash_add(orangefs_inode->xattr_cache, &cx->node,
- xattr_key(cx->key));
+ hlist_add_head(&cx->node,
+ &orangefs_inode->xattr_cache[xattr_key(cx->key)]);
}
}
goto out_release_op;
@@ -229,8 +231,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
memcpy(cx->val, buffer, length);
cx->length = length;
cx->timeout = jiffies + HZ;
- hash_add(orangefs_inode->xattr_cache, &cx->node,
- xattr_key(cx->key));
+ hlist_add_head(&cx->node,
+ &orangefs_inode->xattr_cache[xattr_key(cx->key)]);
}
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 27396fe63f6d..aac7e34f56c1 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -242,7 +242,7 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
return 0;
}
-static int ovl_sync_file(struct path *path)
+static int ovl_sync_file(const struct path *path)
{
struct file *new_file;
int err;
@@ -670,7 +670,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
- if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ if (inode->i_flags & OVL_FATTR_I_FLAGS_MASK &&
(S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
/*
* Copy the fileattr inode flags that are the source of already
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index dbd63a74df4b..a5e9ddf3023b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -187,6 +187,13 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
/* mkdir is special... */
newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode);
err = PTR_ERR_OR_ZERO(newdentry);
+ /* expect to inherit casefolding from workdir/upperdir */
+ if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) {
+ pr_warn_ratelimited("wrong inherited casefold (%pd2)\n",
+ newdentry);
+ dput(newdentry);
+ err = -EINVAL;
+ }
break;
case S_IFCHR:
@@ -205,12 +212,32 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
err = -EPERM;
}
}
- if (!err && WARN_ON(!newdentry->d_inode)) {
+ if (err)
+ goto out;
+
+ if (WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -EIO;
+ } else if (d_unhashed(newdentry)) {
+ struct dentry *d;
+ /*
+ * Some filesystems (i.e. casefolded) may return an unhashed
+ * negative dentry from the ovl_lookup_upper() call before
+ * ovl_create_real().
+ * In that case, lookup again after making the newdentry
+ * positive, so ovl_create_upper() always returns a hashed
+ * positive dentry.
+ */
+ d = ovl_lookup_upper(ofs, newdentry->d_name.name, parent,
+ newdentry->d_name.len);
+ dput(newdentry);
+ if (IS_ERR_OR_NULL(d))
+ err = d ? PTR_ERR(d) : -ENOENT;
+ else
+ return d;
}
out:
if (err) {
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index f5b8877d5fe2..fc52c796061d 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -120,7 +120,7 @@ static bool ovl_is_real_file(const struct file *realfile,
}
static struct file *ovl_real_file_path(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct ovl_file *of = file->private_data;
struct file *realfile = of->realfile;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index ecb9f2019395..aaa4cf579561 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -1277,6 +1277,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
}
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
ovl_inode_init(inode, oip, ino, fsid);
+ WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold);
if (upperdentry && ovl_is_impuredir(sb, upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 76d6248b625e..e93bcc5727bc 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -239,13 +239,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
char val;
/*
- * We allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories. If someone has enabled case
- * folding on a directory on underlying layer, the warranty of the ovl
- * stack is voided.
+ * We allow filesystems that are case-folding capable as long as the
+ * layers are consistently enabled in the stack, enabled for every dir
+ * or disabled in all dirs. If someone has modified case folding on a
+ * directory on underlying layer, the warranty of the ovl stack is
+ * voided.
*/
- if (ovl_dentry_casefolded(base)) {
- warn = "case folded parent";
+ if (ofs->casefold != ovl_dentry_casefolded(base)) {
+ warn = "parent wrong casefold";
err = -ESTALE;
goto out_warn;
}
@@ -259,8 +260,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out_err;
}
- if (ovl_dentry_casefolded(this)) {
- warn = "case folded child";
+ if (ofs->casefold != ovl_dentry_casefolded(this)) {
+ warn = "child wrong casefold";
err = -EREMOTE;
goto out_warn;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f84abaa0d68..c8fd5951fc5e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -562,11 +562,11 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
struct ovl_metacopy *metacopy);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
-int ovl_ensure_verity_loaded(struct path *path);
+int ovl_ensure_verity_loaded(const struct path *path);
int ovl_validate_verity(struct ovl_fs *ofs,
- struct path *metapath,
- struct path *datapath);
-int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
+ const struct path *metapath,
+ const struct path *datapath);
+int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src,
struct ovl_metacopy *metacopy);
int ovl_sync_status(struct ovl_fs *ofs);
@@ -820,10 +820,12 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip);
void ovl_copyattr(struct inode *to);
+/* vfs fileattr flags read from overlay.protattr xattr to ovl inode */
+#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+/* vfs fileattr flags copied from real to ovl inode */
+#define OVL_FATTR_I_FLAGS_MASK (OVL_PROT_I_FLAGS_MASK | S_SYNC | S_NOATIME)
/* vfs inode flags copied from real to ovl inode */
-#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
-/* vfs inode flags read from overlay.protattr xattr to ovl inode */
-#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+#define OVL_COPY_I_FLAGS_MASK (OVL_FATTR_I_FLAGS_MASK | S_CASEFOLD)
/*
* fileattr flags copied from lower to upper inode on copy up.
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 4c1bae935ced..1d4828dbcf7a 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -91,6 +91,7 @@ struct ovl_fs {
struct mutex whiteout_lock;
/* r/o snapshot of upperdir sb's only taken on volatile mounts */
errseq_t errseq;
+ bool casefold;
};
/* Number of lower layers, not including data-only layers */
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index f4e7fff909ac..63b7346c5ee1 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -276,17 +276,26 @@ static int ovl_mount_dir(const char *name, struct path *path)
static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
enum ovl_opt layer, const char *name, bool upper)
{
+ bool is_casefolded = ovl_dentry_casefolded(path->dentry);
struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs *ofs = fc->s_fs_info;
if (!d_is_dir(path->dentry))
return invalfc(fc, "%s is not a directory", name);
/*
* Allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories.
+ * ovl stack from inconsistent case-folded directories.
*/
- if (ovl_dentry_casefolded(path->dentry))
- return invalfc(fc, "case-insensitive directory on %s not supported", name);
+ if (!ctx->casefold_set) {
+ ofs->casefold = is_casefolded;
+ ctx->casefold_set = true;
+ }
+
+ if (ofs->casefold != is_casefolded) {
+ return invalfc(fc, "case-%ssensitive directory on %s is inconsistent",
+ is_casefolded ? "in" : "", name);
+ }
if (ovl_dentry_weird(path->dentry))
return invalfc(fc, "filesystem on %s not supported", name);
diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h
index c96d93982021..ffd53cdd8482 100644
--- a/fs/overlayfs/params.h
+++ b/fs/overlayfs/params.h
@@ -33,6 +33,7 @@ struct ovl_fs_context {
struct ovl_opt_set set;
struct ovl_fs_context_layer *lower;
char *lowerdir_all; /* user provided lowerdir string */
+ bool casefold_set;
};
int ovl_init_fs_context(struct fs_context *fc);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 15cb06fa0c9a..1e9792cc557b 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -27,6 +27,8 @@ struct ovl_cache_entry {
bool is_upper;
bool is_whiteout;
bool check_xwhiteout;
+ const char *c_name;
+ int c_len;
char name[];
};
@@ -45,6 +47,7 @@ struct ovl_readdir_data {
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
+ struct unicode_map *map;
int count;
int err;
bool is_upper;
@@ -66,6 +69,31 @@ static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
return rb_entry(n, struct ovl_cache_entry, node);
}
+static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len,
+ char **dst)
+{
+ const struct qstr qstr = { .name = str, .len = len };
+ char *cf_name;
+ int cf_len;
+
+ if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len))
+ return 0;
+
+ cf_name = kmalloc(NAME_MAX, GFP_KERNEL);
+ if (!cf_name) {
+ rdd->err = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX);
+ if (cf_len > 0)
+ *dst = cf_name;
+ else
+ kfree(cf_name);
+
+ return cf_len;
+}
+
static bool ovl_cache_entry_find_link(const char *name, int len,
struct rb_node ***link,
struct rb_node **parent)
@@ -79,10 +107,10 @@ static bool ovl_cache_entry_find_link(const char *name, int len,
*parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
- cmp = strncmp(name, tmp->name, len);
+ cmp = strncmp(name, tmp->c_name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
- else if (cmp < 0 || len < tmp->len)
+ else if (cmp < 0 || len < tmp->c_len)
newp = &tmp->node.rb_left;
else
found = true;
@@ -101,10 +129,10 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
- cmp = strncmp(name, p->name, len);
+ cmp = strncmp(name, p->c_name, len);
if (cmp > 0)
node = p->node.rb_right;
- else if (cmp < 0 || len < p->len)
+ else if (cmp < 0 || len < p->c_len)
node = p->node.rb_left;
else
return p;
@@ -145,6 +173,7 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
+ const char *c_name, int c_len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
@@ -167,6 +196,14 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
/* Defer check for overlay.whiteout to ovl_iterate() */
p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
+ if (c_name && c_name != name) {
+ p->c_name = c_name;
+ p->c_len = c_len;
+ } else {
+ p->c_name = p->name;
+ p->c_len = len;
+ }
+
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
@@ -174,48 +211,62 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
return p;
}
-static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
- const char *name, int len, u64 ino,
+/* Return 0 for found, 1 for added, <0 for error */
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len,
+ const char *c_name, int c_len,
+ u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root->rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
- if (ovl_cache_entry_find_link(name, len, &newp, &parent))
- return true;
+ if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent))
+ return 0;
- p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return false;
+ return -ENOMEM;
}
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, rdd->root);
- return true;
+ return 1;
}
-static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
+/* Return 0 for found, 1 for added, <0 for error */
+static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
+ const char *c_name, int c_len,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
- p = ovl_cache_entry_find(rdd->root, name, namelen);
+ p = ovl_cache_entry_find(rdd->root, c_name, c_len);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
+ return 0;
} else {
- p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len,
+ ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
- return rdd->err == 0;
+ return rdd->err ?: 1;
+}
+
+static void ovl_cache_entry_free(struct ovl_cache_entry *p)
+{
+ if (p->c_name != p->name)
+ kfree(p->c_name);
+ kfree(p);
}
void ovl_cache_free(struct list_head *list)
@@ -224,7 +275,7 @@ void ovl_cache_free(struct list_head *list)
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
- kfree(p);
+ ovl_cache_entry_free(p);
INIT_LIST_HEAD(list);
}
@@ -260,12 +311,39 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
+ struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb);
+ const char *c_name = NULL;
+ char *cf_name = NULL;
+ int c_len = 0, ret;
+
+ if (ofs->casefold)
+ c_len = ovl_casefold(rdd, name, namelen, &cf_name);
+
+ if (rdd->err)
+ return false;
+
+ if (c_len <= 0) {
+ c_name = name;
+ c_len = namelen;
+ } else {
+ c_name = cf_name;
+ }
rdd->count++;
if (!rdd->is_lowest)
- return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type);
else
- return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
+ ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type);
+
+ /*
+ * If ret == 1, c_name is now owned by the struct ovl_cache_entry and
+ * will be freed at ovl_cache_free(). Otherwise (found in the rb-tree,
+ * or on error) we must free it here.
+ */
+ if (ret != 1 && c_name != name)
+ kfree(c_name);
+
+ return ret >= 0;
}
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
@@ -357,12 +435,18 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.list = list,
.root = root,
.is_lowest = false,
+ .map = NULL,
};
int idx, next;
const struct ovl_layer *layer;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath, &layer);
+
+ if (ofs->casefold)
+ rdd.map = sb_encoding(realpath.dentry->d_sb);
+
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
ovl_dentry_has_xwhiteouts(dentry);
@@ -555,7 +639,7 @@ static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
- p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
return false;
@@ -595,7 +679,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
}
if (p->ino == p->real_ino) {
list_del(&p->l_node);
- kfree(p);
+ ovl_cache_entry_free(p);
} else {
struct rb_node **newp = &root->rb_node;
struct rb_node *parent = NULL;
@@ -1023,7 +1107,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
del_entry:
list_del(&p->l_node);
- kfree(p);
+ ovl_cache_entry_free(p);
}
return err;
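
The readdir cache above keeps two names per entry: the name as read from the layer, and a folded key (c_name/c_len) used only for rb-tree ordering and duplicate detection, so that "Foo" from one layer and "foo" from another merge into a single entry; the key is freed by the caller unless the entry was actually inserted. A rough userspace sketch of this own-the-key-only-if-inserted pattern, with tolower() standing in for utf8_casefold() (assumptions: ASCII-only names, a linked list instead of an rb-tree):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	char *name;	/* name as emitted by the layer */
	char *c_name;	/* folded key used for dedup */
	struct entry *next;
};

static struct entry *head;

static char *fold(const char *s)
{
	char *k = strdup(s);

	for (char *p = k; *p; p++)
		*p = tolower((unsigned char)*p);
	return k;
}

/* Return 0 if an entry with the same folded key exists, 1 if added. */
static int add_entry(const char *name)
{
	char *key = fold(name);
	struct entry *e;

	for (e = head; e; e = e->next) {
		if (!strcmp(e->c_name, key)) {
			free(key);	/* not inserted: free the key here */
			return 0;
		}
	}
	e = malloc(sizeof(*e));
	e->name = strdup(name);
	e->c_name = key;		/* inserted: entry now owns the key */
	e->next = head;
	head = e;
	return 1;
}

int main(void)
{
	int a = add_entry("Foo");
	int b = add_entry("foo");
	int c = add_entry("bar");

	printf("%d %d %d\n", a, b, c);	/* 1 0 1 */
	return 0;
}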
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bd3d7ba8fb95..43ee4c7296a7 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -161,6 +161,16 @@ static const struct dentry_operations ovl_dentry_operations = {
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
+#if IS_ENABLED(CONFIG_UNICODE)
+static const struct dentry_operations ovl_dentry_ci_operations = {
+ .d_real = ovl_d_real,
+ .d_revalidate = ovl_dentry_revalidate,
+ .d_weak_revalidate = ovl_dentry_weak_revalidate,
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+};
+#endif
+
static struct kmem_cache *ovl_inode_cachep;
static struct inode *ovl_alloc_inode(struct super_block *sb)
@@ -394,7 +404,7 @@ static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
return err;
}
-static int ovl_lower_dir(const char *name, struct path *path,
+static int ovl_lower_dir(const char *name, const struct path *path,
struct ovl_fs *ofs, int *stack_depth)
{
int fh_type;
@@ -991,6 +1001,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs)
return ofs->numfs;
}
+/*
+ * Set the ovl sb encoding to the one used by the first layer
+ */
+static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb)
+{
+ if (!sb_has_encoding(fs_sb))
+ return 0;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (sb_has_strict_encoding(fs_sb)) {
+ pr_err("strict encoding not supported\n");
+ return -EINVAL;
+ }
+
+ sb->s_encoding = fs_sb->s_encoding;
+ sb->s_encoding_flags = fs_sb->s_encoding_flags;
+#endif
+ return 0;
+}
static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
struct ovl_fs_context *ctx, struct ovl_layer *layers)
@@ -1024,6 +1053,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (ovl_upper_mnt(ofs)) {
ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
ofs->fs[0].is_lower = false;
+
+ if (ofs->casefold) {
+ err = ovl_set_encoding(sb, ofs->fs[0].sb);
+ if (err)
+ return err;
+ }
}
nr_merged_lower = ctx->nr - ctx->nr_data;
@@ -1083,6 +1118,19 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
l->name = NULL;
ofs->numlayer++;
ofs->fs[fsid].is_lower = true;
+
+ if (ofs->casefold) {
+ if (!ovl_upper_mnt(ofs) && !sb_has_encoding(sb)) {
+ err = ovl_set_encoding(sb, ofs->fs[fsid].sb);
+ if (err)
+ return err;
+ }
+
+ if (!sb_same_encoding(sb, mnt->mnt_sb)) {
+ pr_err("all layers must have the same encoding\n");
+ return -EINVAL;
+ }
+ }
}
/*
@@ -1300,6 +1348,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
ovl_set_upperdata(d_inode(root));
ovl_inode_init(d_inode(root), &oip, ino, fsid);
+ WARN_ON(!!IS_CASEFOLDED(d_inode(root)) != ofs->casefold);
ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE);
/* root keeps a reference of upperdentry */
dget(upperdentry);
@@ -1307,6 +1356,19 @@ static struct dentry *ovl_get_root(struct super_block *sb,
return root;
}
+static void ovl_set_d_op(struct super_block *sb)
+{
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->casefold) {
+ set_default_d_op(sb, &ovl_dentry_ci_operations);
+ return;
+ }
+#endif
+ set_default_d_op(sb, &ovl_dentry_operations);
+}
+
int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ovl_fs *ofs = sb->s_fs_info;
@@ -1322,7 +1384,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (WARN_ON(fc->user_ns != current_user_ns()))
goto out_err;
- set_default_d_op(sb, &ovl_dentry_operations);
+ ovl_set_d_op(sb);
err = -ENOMEM;
if (!ofs->creator_cred)
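
ovl_set_d_op() picks the dentry operation table once at superblock setup: casefolded mounts get generic_ci_d_hash/generic_ci_d_compare so name hashing and comparison go through the sb encoding. The once-at-mount op-table selection looks like this in miniature (userspace sketch, illustrative names only):

#include <stdio.h>
#include <string.h>
#include <strings.h>

struct dentry_ops {
	int (*compare)(const char *a, const char *b);
};

static int cmp_exact(const char *a, const char *b) { return strcmp(a, b); }
static int cmp_ci(const char *a, const char *b)    { return strcasecmp(a, b); }

static const struct dentry_ops exact_ops = { .compare = cmp_exact };
static const struct dentry_ops ci_ops    = { .compare = cmp_ci };

/* Pick the op table once at "mount" time, as ovl_set_d_op() does. */
static const struct dentry_ops *set_d_op(int casefold)
{
	return casefold ? &ci_ops : &exact_ops;
}

int main(void)
{
	const struct dentry_ops *ops = set_d_op(1);

	printf("%d\n", ops->compare("Foo", "foo") == 0);	/* 1 */
	return 0;
}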
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 41033bac96cb..f76672f2e686 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -210,11 +210,11 @@ bool ovl_dentry_weird(struct dentry *dentry)
return true;
/*
- * Allow filesystems that are case-folding capable but deny composing
- * ovl stack from case-folded directories.
+ * Exceptionally, for casefolded layers we accept that they have
+ * their own hash and compare operations.
*/
if (sb_has_encoding(dentry->d_sb))
- return IS_CASEFOLDED(d_inode(dentry));
+ return false;
return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE);
}
@@ -1381,7 +1381,7 @@ err_free:
}
/* Call with mounter creds as it may open the file */
-int ovl_ensure_verity_loaded(struct path *datapath)
+int ovl_ensure_verity_loaded(const struct path *datapath)
{
struct inode *inode = d_inode(datapath->dentry);
struct file *filp;
@@ -1401,8 +1401,8 @@ int ovl_ensure_verity_loaded(struct path *datapath)
}
int ovl_validate_verity(struct ovl_fs *ofs,
- struct path *metapath,
- struct path *datapath)
+ const struct path *metapath,
+ const struct path *datapath)
{
struct ovl_metacopy metacopy_data;
u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE];
@@ -1455,7 +1455,7 @@ int ovl_validate_verity(struct ovl_fs *ofs,
return 0;
}
-int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
+int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src,
struct ovl_metacopy *metacopy)
{
int err, digest_size;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 44a95cd27377..0ef5b47d796a 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -850,7 +850,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
-static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
+static struct file *pidfs_export_open(const struct path *path, unsigned int oflags)
{
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
diff --git a/fs/pnode.c b/fs/pnode.c
index 6f7d02f3fa98..5d91c3e58d2a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p)
return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
+/* locks: namespace_shared && is_mounted(mnt) */
static struct mount *get_peer_under_root(struct mount *mnt,
struct mnt_namespace *ns,
const struct path *root)
@@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt,
* Get ID of closest dominating peer group having a representative
* under the given root.
*
- * Caller must hold namespace_sem
+ * locks: namespace_shared
*/
int get_dominating_id(struct mount *mnt, const struct path *root)
{
@@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m)
return m->mnt.mnt_flags & MNT_UMOUNT;
}
-static struct mount *propagation_source(struct mount *mnt)
-{
- do {
- struct mount *m;
- for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
- if (!will_be_unmounted(m))
- return m;
- }
- mnt = mnt->mnt_master;
- } while (mnt && will_be_unmounted(mnt));
- return mnt;
-}
-
static void transfer_propagation(struct mount *mnt, struct mount *to)
{
struct hlist_node *p = NULL, *n;
@@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
return;
}
if (IS_MNT_SHARED(mnt)) {
- if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
- m = propagation_source(mnt);
if (list_empty(&mnt->mnt_share)) {
mnt_release_group_id(mnt);
} else {
+ m = next_peer(mnt);
list_del_init(&mnt->mnt_share);
mnt->mnt_group_id = 0;
}
@@ -136,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type)
}
}
+static struct mount *trace_transfers(struct mount *m)
+{
+ while (1) {
+ struct mount *next = next_peer(m);
+
+ if (next != m) {
+ list_del_init(&m->mnt_share);
+ m->mnt_group_id = 0;
+ m->mnt_master = next;
+ } else {
+ if (IS_MNT_SHARED(m))
+ mnt_release_group_id(m);
+ next = m->mnt_master;
+ }
+ hlist_del_init(&m->mnt_slave);
+ CLEAR_MNT_SHARED(m);
+ SET_MNT_MARK(m);
+
+ if (!next || !will_be_unmounted(next))
+ return next;
+ if (IS_MNT_MARKED(next))
+ return next->mnt_master;
+ m = next;
+ }
+}
+
+static void set_destinations(struct mount *m, struct mount *master)
+{
+ struct mount *next;
+
+ while ((next = m->mnt_master) != master) {
+ m->mnt_master = master;
+ m = next;
+ }
+}
+
+void bulk_make_private(struct list_head *set)
+{
+ struct mount *m;
+
+ list_for_each_entry(m, set, mnt_list)
+ if (!IS_MNT_MARKED(m))
+ set_destinations(m, trace_transfers(m));
+
+ list_for_each_entry(m, set, mnt_list) {
+ transfer_propagation(m, m->mnt_master);
+ m->mnt_master = NULL;
+ CLEAR_MNT_MARK(m);
+ }
+}
+
static struct mount *__propagation_next(struct mount *m,
struct mount *origin)
{
@@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
err = PTR_ERR(this);
break;
}
- read_seqlock_excl(&mount_lock);
- mnt_set_mountpoint(n, dest_mp, this);
- read_sequnlock_excl(&mount_lock);
+ scoped_guard(mount_locked_reader)
+ mnt_set_mountpoint(n, dest_mp, this);
if (n->mnt_master)
SET_MNT_MARK(n->mnt_master);
copy = this;
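
trace_transfers() walks chains of to-be-unmounted masters to find each mount's surviving propagation destination, and set_destinations() then short-circuits every mnt_master pointer on the chain so later passes do constant work per mount. The pointer rewrite is easy to model in userspace (struct and names below are illustrative only):

#include <assert.h>
#include <stddef.h>

struct mnt {
	struct mnt *master;
};

/* Redirect every master pointer on the chain starting at m so it
 * points directly at the surviving destination, mirroring
 * set_destinations() in fs/pnode.c. */
static void set_destinations(struct mnt *m, struct mnt *master)
{
	struct mnt *next;

	while ((next = m->master) != master) {
		m->master = master;
		m = next;
	}
}

int main(void)
{
	struct mnt d = { NULL };			/* surviving destination */
	struct mnt c = { &d }, b = { &c }, a = { &b };	/* doomed chain */

	set_destinations(&a, &d);
	assert(a.master == &d && b.master == &d && c.master == &d);
	return 0;
}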
diff --git a/fs/pnode.h b/fs/pnode.h
index 00ab153e3e9d..b029db225f33 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
}
void change_mnt_propagation(struct mount *, int);
+void bulk_make_private(struct list_head *);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
void propagate_umount(struct list_head *);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b997ceef9135..6299878e3d97 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3947,7 +3947,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
tid = task_pid_nr_ns(task, ns);
if (!tid)
continue; /* The task has just exited. */
- len = snprintf(name, sizeof(name), "%u", tid);
+ len = snprintf(name, sizeof(name), "%d", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index afa15a214538..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for quota_release_work */
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_dfl_wq, &quota_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
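
dqput() now defers dquot release to a dedicated WQ_UNBOUND | WQ_MEM_RECLAIM queue rather than a shared system workqueue, so slow releases neither pin one CPU nor stall under memory reclaim. A minimal module-style sketch of the same allocate-then-queue pattern (illustrative only, not the dquot code):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static void demo_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_work, demo_fn);

static void demo_fn(struct work_struct *work)
{
	pr_info("deferred work ran on its own unbound queue\n");
}

static int __init demo_init(void)
{
	/* Unbound so it may run on any CPU; WQ_MEM_RECLAIM guarantees a
	 * rescuer thread so the queue makes progress in reclaim paths. */
	demo_wq = alloc_workqueue("demo_unbound",
				  WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
	if (!demo_wq)
		return -ENOMEM;
	queue_delayed_work(demo_wq, &demo_work, 1);	/* 1 jiffy, as in dqput() */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");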
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 9f05f94e265a..a4c02199fef4 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -15,6 +15,7 @@ config CIFS
select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
+ select CRYPTO_LIB_ARC4
select KEYS
select DNS_RESOLVER
select ASN1
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index b69daeb1301b..b36f9f9340f0 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -36,9 +36,8 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* fully cached or it may be in the process of
* being deleted due to a lease break.
*/
- if (!cfid->time || !cfid->has_lease) {
+ if (!is_valid_cached_dir(cfid))
return NULL;
- }
kref_get(&cfid->refcount);
return cfid;
}
@@ -194,7 +193,7 @@ replay_again:
* Otherwise, it is either a new entry or laundromat worker removed it
* from @cfids->entries. Caller will put last reference if the latter.
*/
- if (cfid->has_lease && cfid->time) {
+ if (is_valid_cached_dir(cfid)) {
cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
@@ -233,7 +232,7 @@ replay_again:
list_for_each_entry(parent_cfid, &cfids->entries, entry) {
if (parent_cfid->dentry == dentry->d_parent) {
cifs_dbg(FYI, "found a parent cached file handle\n");
- if (parent_cfid->has_lease && parent_cfid->time) {
+ if (is_valid_cached_dir(parent_cfid)) {
lease_flags
|= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
memcpy(pfid->parent_lease_key,
@@ -417,12 +416,18 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
if (cfids == NULL)
return -EOPNOTSUPP;
+ if (!dentry)
+ return -ENOENT;
+
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry(cfid, &cfids->entries, entry) {
- if (dentry && cfid->dentry == dentry) {
+ if (cfid->dentry == dentry) {
+ if (!is_valid_cached_dir(cfid))
+ break;
cifs_dbg(FYI, "found a cached file handle by dentry\n");
kref_get(&cfid->refcount);
*ret_cfid = cfid;
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
return 0;
}
@@ -522,10 +527,9 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
spin_unlock(&cifs_sb->tlink_tree_lock);
goto done;
}
- spin_lock(&cfid->fid_lock);
+
tmp_list->dentry = cfid->dentry;
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
list_add_tail(&tmp_list->entry, &entry);
}
@@ -608,14 +612,9 @@ static void cached_dir_put_work(struct work_struct *work)
{
struct cached_fid *cfid = container_of(work, struct cached_fid,
put_work);
- struct dentry *dentry;
-
- spin_lock(&cfid->fid_lock);
- dentry = cfid->dentry;
+ dput(cfid->dentry);
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
- dput(dentry);
queue_work(serverclose_wq, &cfid->close_work);
}
@@ -673,7 +672,6 @@ static struct cached_fid *init_cached_dir(const char *path)
INIT_LIST_HEAD(&cfid->entry);
INIT_LIST_HEAD(&cfid->dirents.entries);
mutex_init(&cfid->dirents.de_mutex);
- spin_lock_init(&cfid->fid_lock);
kref_init(&cfid->refcount);
return cfid;
}
@@ -697,6 +695,21 @@ static void free_cached_dir(struct cached_fid *cfid)
kfree(dirent);
}
+ /* adjust tcon-level counters and reset per-dir accounting */
+ if (cfid->cfids) {
+ if (cfid->dirents.entries_count)
+ atomic_long_sub((long)cfid->dirents.entries_count,
+ &cfid->cfids->total_dirents_entries);
+ if (cfid->dirents.bytes_used) {
+ atomic64_sub((long long)cfid->dirents.bytes_used,
+ &cfid->cfids->total_dirents_bytes);
+ atomic64_sub((long long)cfid->dirents.bytes_used,
+ &cifs_dircache_bytes_used);
+ }
+ }
+ cfid->dirents.entries_count = 0;
+ cfid->dirents.bytes_used = 0;
+
kfree(cfid->path);
cfid->path = NULL;
kfree(cfid);
@@ -725,7 +738,6 @@ static void cfids_laundromat_worker(struct work_struct *work)
{
struct cached_fids *cfids;
struct cached_fid *cfid, *q;
- struct dentry *dentry;
LIST_HEAD(entry);
cfids = container_of(work, struct cached_fids, laundromat_work.work);
@@ -752,12 +764,9 @@ static void cfids_laundromat_worker(struct work_struct *work)
list_for_each_entry_safe(cfid, q, &entry, entry) {
list_del(&cfid->entry);
- spin_lock(&cfid->fid_lock);
- dentry = cfid->dentry;
+ dput(cfid->dentry);
cfid->dentry = NULL;
- spin_unlock(&cfid->fid_lock);
- dput(dentry);
if (cfid->is_open) {
spin_lock(&cifs_tcp_ses_lock);
++cfid->tcon->tc_count;
@@ -792,6 +801,9 @@ struct cached_fids *init_cached_dirs(void)
queue_delayed_work(cfid_put_wq, &cfids->laundromat_work,
dir_cache_timeout * HZ);
+ atomic_long_set(&cfids->total_dirents_entries, 0);
+ atomic64_set(&cfids->total_dirents_bytes, 0);
+
return cfids;
}
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index 46b5a2fdf15b..31339dc32719 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -27,6 +27,9 @@ struct cached_dirents {
struct mutex de_mutex;
loff_t pos; /* Expected ctx->pos */
struct list_head entries;
+ /* accounting for cached entries in this directory */
+ unsigned long entries_count;
+ unsigned long bytes_used;
};
struct cached_fid {
@@ -41,7 +44,6 @@ struct cached_fid {
unsigned long last_access_time; /* jiffies of when last accessed */
struct kref refcount;
struct cifs_fid fid;
- spinlock_t fid_lock;
struct cifs_tcon *tcon;
struct dentry *dentry;
struct work_struct put_work;
@@ -62,8 +64,20 @@ struct cached_fids {
struct list_head dying;
struct work_struct invalidation_work;
struct delayed_work laundromat_work;
+ /* aggregate accounting for all cached dirents under this tcon */
+ atomic_long_t total_dirents_entries;
+ atomic64_t total_dirents_bytes;
};
+/* Module-wide directory cache accounting (defined in cifsfs.c) */
+extern atomic64_t cifs_dircache_bytes_used; /* bytes across all mounts */
+
+static inline bool
+is_valid_cached_dir(struct cached_fid *cfid)
+{
+ return cfid->time && cfid->has_lease;
+}
+
extern struct cached_fids *init_cached_dirs(void);
extern void free_cached_dirs(struct cached_fids *cfids);
extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 35c4d27d2cc0..1fb71d2d31b5 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -240,14 +240,18 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsFileInfo *cfile;
+ struct inode *inode;
+ struct cifsInodeInfo *cinode;
+ char lease[4];
+ int n;
seq_puts(m, "# Version:1\n");
seq_puts(m, "# Format:\n");
seq_puts(m, "# <tree id> <ses id> <persistent fid> <flags> <count> <pid> <uid>");
#ifdef CONFIG_CIFS_DEBUG2
- seq_printf(m, " <filename> <mid>\n");
+ seq_puts(m, " <filename> <lease> <mid>\n");
#else
- seq_printf(m, " <filename>\n");
+ seq_puts(m, " <filename> <lease>\n");
#endif /* CIFS_DEBUG2 */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
@@ -267,11 +271,30 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
cfile->pid,
from_kuid(&init_user_ns, cfile->uid),
cfile->dentry);
+
+ /* Append lease/oplock caching state as RHW letters */
+ inode = d_inode(cfile->dentry);
+ n = 0;
+ if (inode) {
+ cinode = CIFS_I(inode);
+ if (CIFS_CACHE_READ(cinode))
+ lease[n++] = 'R';
+ if (CIFS_CACHE_HANDLE(cinode))
+ lease[n++] = 'H';
+ if (CIFS_CACHE_WRITE(cinode))
+ lease[n++] = 'W';
+ }
+ lease[n] = '\0';
+ seq_puts(m, " ");
+ if (n)
+ seq_printf(m, "%s", lease);
+ else
+ seq_puts(m, "NONE");
+
#ifdef CONFIG_CIFS_DEBUG2
- seq_printf(m, " %llu\n", cfile->fid.mid);
-#else
+ seq_printf(m, " %llu", cfile->fid.mid);
+#endif /* CONFIG_CIFS_DEBUG2 */
seq_printf(m, "\n");
-#endif /* CIFS_DEBUG2 */
}
spin_unlock(&tcon->open_file_lock);
}
@@ -308,7 +331,10 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
if (!cfids)
continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
- seq_printf(m, "Num entries: %d\n", cfids->num_entries);
+ seq_printf(m, "Num entries: %d, cached_dirents: %lu entries, %llu bytes\n",
+ cfids->num_entries,
+ (unsigned long)atomic_long_read(&cfids->total_dirents_entries),
+ (unsigned long long)atomic64_read(&cfids->total_dirents_bytes));
list_for_each_entry(cfid, &cfids->entries, entry) {
seq_printf(m, "0x%x 0x%llx 0x%llx %s",
tcon->tid,
@@ -319,6 +345,9 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\tvalid file info");
if (cfid->dirents.is_valid)
seq_printf(m, ", valid dirents");
+ if (!list_empty(&cfid->dirents.entries))
+ seq_printf(m, ", dirents: %lu entries, %lu bytes",
+ cfid->dirents.entries_count, cfid->dirents.bytes_used);
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
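
The new /proc output encodes the lease/oplock caching state of each open file as a subset of the letters RHW (read, handle, write caching), or NONE when nothing is cached. The string construction reduces to this (userspace sketch; the flag values are stand-ins for the CIFS_CACHE_* tests):

#include <stdio.h>

#define CACHE_READ   0x1
#define CACHE_HANDLE 0x2
#define CACHE_WRITE  0x4

/* Render lease caching state as a subset of "RHW", or "NONE". */
static const char *lease_str(unsigned int flags, char buf[4])
{
	int n = 0;

	if (flags & CACHE_READ)
		buf[n++] = 'R';
	if (flags & CACHE_HANDLE)
		buf[n++] = 'H';
	if (flags & CACHE_WRITE)
		buf[n++] = 'W';
	buf[n] = '\0';
	return n ? buf : "NONE";
}

int main(void)
{
	char buf[4];

	printf("%s\n", lease_str(CACHE_READ | CACHE_WRITE, buf));	/* RW */
	printf("%s\n", lease_str(0, buf));				/* NONE */
	return 0;
}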
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 3cc686246908..7b7c8c38fdd0 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -22,8 +22,8 @@
#include <linux/highmem.h>
#include <linux/fips.h>
#include <linux/iov_iter.h>
-#include "../common/arc4.h"
#include <crypto/aead.h>
+#include <crypto/arc4.h>
static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
void *priv, void *priv2)
@@ -725,9 +725,9 @@ calc_seckey(struct cifs_ses *ses)
return -ENOMEM;
}
- cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
- cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
- CIFS_CPHTXT_SIZE);
+ arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
+ arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
+ CIFS_CPHTXT_SIZE);
/* make secondary_key/nonce as session key */
memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
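
calc_seckey() switches from the client's private cifs_arc4_* copy to the shared CRYPTO_LIB_ARC4 helpers, which implement the same cipher. For reference, a self-contained userspace RC4 that reproduces the classic "Key"/"Plaintext" test vector (the algorithm itself, not the kernel API):

#include <stdio.h>

struct arc4 { unsigned char S[256]; unsigned char x, y; };

static void arc4_setkey(struct arc4 *c, const unsigned char *key, int klen)
{
	int i, j = 0;

	c->x = c->y = 0;
	for (i = 0; i < 256; i++)
		c->S[i] = i;
	for (i = 0; i < 256; i++) {
		unsigned char t = c->S[i];

		j = (j + key[i % klen] + t) & 0xff;
		c->S[i] = c->S[j];
		c->S[j] = t;
	}
}

static void arc4_crypt(struct arc4 *c, unsigned char *out,
		       const unsigned char *in, int len)
{
	while (len--) {
		unsigned char a, b;

		c->x = (c->x + 1) & 0xff;
		a = c->S[c->x];
		c->y = (c->y + a) & 0xff;
		b = c->S[c->y];
		c->S[c->x] = b;
		c->S[c->y] = a;
		*out++ = *in++ ^ c->S[(a + b) & 0xff];
	}
}

int main(void)
{
	struct arc4 c;
	unsigned char out[9];

	arc4_setkey(&c, (const unsigned char *)"Key", 3);
	arc4_crypt(&c, out, (const unsigned char *)"Plaintext", 9);
	for (int i = 0; i < 9; i++)
		printf("%02X", out[i]);	/* BBF316E8D940AF0AD3 */
	printf("\n");
	return 0;
}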
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index dcb39d1b5958..1775c2b7528f 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -121,6 +121,46 @@ unsigned int dir_cache_timeout = 30;
module_param(dir_cache_timeout, uint, 0644);
MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 "
"Range: 1 to 65000 seconds, 0 to disable caching dir contents");
+/* Module-wide total cached dirents (in bytes) across all tcons */
+atomic64_t cifs_dircache_bytes_used = ATOMIC64_INIT(0);
+
+/*
+ * Write-only module parameter to drop all cached directory entries across
+ * all CIFS mounts. Echo a non-zero value to trigger.
+ */
+static void cifs_drop_all_dir_caches(void)
+{
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list)
+ invalidate_all_cached_dirs(tcon);
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+static int cifs_param_set_drop_dir_cache(const char *val, const struct kernel_param *kp)
+{
+ bool bv;
+ int rc = kstrtobool(val, &bv);
+
+ if (rc)
+ return rc;
+ if (bv)
+ cifs_drop_all_dir_caches();
+ return 0;
+}
+
+module_param_call(drop_dir_cache, cifs_param_set_drop_dir_cache, NULL, NULL, 0200);
+MODULE_PARM_DESC(drop_dir_cache, "Write 1 to drop all cached directory entries across all CIFS mounts");
+
#ifdef CONFIG_CIFS_STATS2
unsigned int slow_rsp_threshold = 1;
module_param(slow_rsp_threshold, uint, 0644);
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 5223edf6d11a..fc67a6441c96 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -322,13 +322,14 @@ retry_open:
list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) {
if (parent_cfid->dentry == direntry->d_parent) {
cifs_dbg(FYI, "found a parent cached file handle\n");
- if (parent_cfid->has_lease && parent_cfid->time) {
+ if (is_valid_cached_dir(parent_cfid)) {
lease_flags
|= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
memcpy(fid->parent_lease_key,
parent_cfid->fid.lease_key,
SMB2_LEASE_KEY_SIZE);
parent_cfid->dirents.is_valid = false;
+ parent_cfid->dirents.is_failed = true;
}
break;
}
@@ -484,8 +485,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
* in network traffic in the other paths.
*/
if (!(oflags & O_CREAT)) {
- struct dentry *res;
-
/*
* Check for hashed negative dentry. We have already revalidated
* the dentry and it is fine. No need to perform another lookup.
@@ -493,11 +492,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
if (!d_in_lookup(direntry))
return -ENOENT;
- res = cifs_lookup(inode, direntry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- return finish_no_open(file, res);
+ return finish_no_open(file, cifs_lookup(inode, direntry, 0));
}
xid = get_xid();
@@ -683,6 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
const char *full_path;
void *page;
int retry_count = 0;
+ struct cached_fid *cfid = NULL;
xid = get_xid();
@@ -722,6 +718,28 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
cifs_dbg(FYI, "non-NULL inode in lookup\n");
} else {
cifs_dbg(FYI, "NULL inode in lookup\n");
+
+ /*
+ * We can only rely on negative dentries having the same
+ * spelling as the cached dirent if case insensitivity is
+ * forced on mount.
+ *
+ * XXX: if servers correctly announce Case Sensitivity Search
+ * on GetInfo of FileFSAttributeInformation, then we can take
+ * correct action even if case insensitivity is not forced on
+ * mount.
+ */
+ if (pTcon->nocase && !open_cached_dir_by_dentry(pTcon, direntry->d_parent, &cfid)) {
+ /*
+ * dentry is negative and parent is fully cached:
+ * we can assume file does not exist
+ */
+ if (cfid->dirents.is_valid) {
+ close_cached_dir(cfid);
+ goto out;
+ }
+ close_cached_dir(cfid);
+ }
}
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
full_path, d_inode(direntry));
@@ -755,6 +773,8 @@ again:
}
newInode = ERR_PTR(rc);
}
+
+out:
free_dentry_path(page);
cifs_put_tlink(tlink);
free_xid(xid);
@@ -765,7 +785,8 @@ static int
cifs_d_revalidate(struct inode *dir, const struct qstr *name,
struct dentry *direntry, unsigned int flags)
{
- struct inode *inode;
+ struct inode *inode = NULL;
+ struct cached_fid *cfid;
int rc;
if (flags & LOOKUP_RCU)
@@ -812,6 +833,21 @@ cifs_d_revalidate(struct inode *dir, const struct qstr *name,
return 1;
}
+ } else {
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dir->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ if (!open_cached_dir_by_dentry(tcon, direntry->d_parent, &cfid)) {
+ /*
+ * dentry is negative and parent is fully cached:
+ * we can assume file does not exist
+ */
+ if (cfid->dirents.is_valid) {
+ close_cached_dir(cfid);
+ return 1;
+ }
+ close_cached_dir(cfid);
+ }
}
/*
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 072383899e81..e60927b2a7c8 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
}
- len = 0;
value = strchr(key, '=');
if (value) {
if (value == key)
continue;
*value++ = 0;
- len = strlen(value);
}
- ret = vfs_parse_fs_string(fc, key, value, len);
+ ret = vfs_parse_fs_string(fc, key, value);
if (ret < 0)
break;
}
@@ -1820,6 +1818,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
+ /*
+ * Multichannel is not meaningful if max_channels is 1.
+ * Force multichannel to false to ensure consistent configuration.
+ */
+ if (ctx->multichannel && ctx->max_channels == 1)
+ ctx->multichannel = false;
+
return 0;
cifs_parse_mount_err:
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 7e9784080501..8bb544be401e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2704,7 +2704,7 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return true;
if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) {
- if (cfid->time && cifs_i->time > cfid->time) {
+ if (cifs_i->time > cfid->time) {
close_cached_dir(cfid);
return false;
}
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 4e5460206397..f0ce26622a14 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -874,39 +874,42 @@ static void finished_cached_dirents_count(struct cached_dirents *cde,
cde->is_valid = 1;
}
-static void add_cached_dirent(struct cached_dirents *cde,
- struct dir_context *ctx,
- const char *name, int namelen,
- struct cifs_fattr *fattr,
- struct file *file)
+static bool add_cached_dirent(struct cached_dirents *cde,
+ struct dir_context *ctx, const char *name,
+ int namelen, struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
if (cde->file != file)
- return;
+ return false;
if (cde->is_valid || cde->is_failed)
- return;
+ return false;
if (ctx->pos != cde->pos) {
cde->is_failed = 1;
- return;
+ return false;
}
de = kzalloc(sizeof(*de), GFP_ATOMIC);
if (de == NULL) {
cde->is_failed = 1;
- return;
+ return false;
}
de->namelen = namelen;
de->name = kstrndup(name, namelen, GFP_ATOMIC);
if (de->name == NULL) {
kfree(de);
cde->is_failed = 1;
- return;
+ return false;
}
de->pos = ctx->pos;
memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr));
list_add_tail(&de->entry, &cde->entries);
+ /* update accounting */
+ cde->entries_count++;
+ cde->bytes_used += sizeof(*de) + (size_t)namelen + 1;
+ return true;
}
static bool cifs_dir_emit(struct dir_context *ctx,
@@ -915,7 +918,8 @@ static bool cifs_dir_emit(struct dir_context *ctx,
struct cached_fid *cfid,
struct file *file)
{
- bool rc;
+ size_t delta_bytes = 0;
+ bool rc, added = false;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype);
@@ -923,10 +927,20 @@ static bool cifs_dir_emit(struct dir_context *ctx,
return rc;
if (cfid) {
+ /* Cost of this entry */
+ delta_bytes = sizeof(struct cached_dirent) + (size_t)namelen + 1;
+
mutex_lock(&cfid->dirents.de_mutex);
- add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr, file);
+ added = add_cached_dirent(&cfid->dirents, ctx, name, namelen,
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
+
+ if (added) {
+ /* per-tcon then global for consistency with free path */
+ atomic64_add((long long)delta_bytes, &cfid->cfids->total_dirents_bytes);
+ atomic_long_inc(&cfid->cfids->total_dirents_entries);
+ atomic64_add((long long)delta_bytes, &cifs_dircache_bytes_used);
+ }
}
return rc;
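
add_cached_dirent() now reports whether the entry was actually inserted, and only then do the counters move: per-directory entries_count/bytes_used, the per-tcon atomics, and the module-wide total, with the byte cost computed as sizeof(*de) + namelen + 1. free_cached_dir() subtracts the same amounts, keeping the aggregates balanced. A compact userspace model of the paired charge/release accounting (C11 atomics; names are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

static atomic_long total_entries;
static atomic_llong total_bytes;

struct dirent_cost { size_t entries; size_t bytes; };

/* Charge one cached entry: struct overhead + name + NUL, as in
 * add_cached_dirent(). */
static void charge(struct dirent_cost *dc, const char *name)
{
	size_t delta = sizeof(struct dirent_cost) + strlen(name) + 1;

	dc->entries++;
	dc->bytes += delta;
	atomic_fetch_add(&total_entries, 1);
	atomic_fetch_add(&total_bytes, (long long)delta);
}

/* Release everything charged to this directory, mirroring
 * free_cached_dir(). */
static void release(struct dirent_cost *dc)
{
	atomic_fetch_sub(&total_entries, (long)dc->entries);
	atomic_fetch_sub(&total_bytes, (long long)dc->bytes);
	dc->entries = 0;
	dc->bytes = 0;
}

int main(void)
{
	struct dirent_cost dc = { 0, 0 };

	charge(&dc, "a.txt");
	charge(&dc, "b.txt");
	release(&dc);
	printf("%ld %lld\n", atomic_load(&total_entries),
	       atomic_load(&total_bytes));	/* 0 0 */
	return 0;
}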
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4711a23c5b38..058050f744c0 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -954,11 +954,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
if (!rc) {
- if (cfid->has_lease) {
- close_cached_dir(cfid);
- return 0;
- }
close_cached_dir(cfid);
+ return 0;
}
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
@@ -4219,7 +4216,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
struct aead_request **req, struct sg_table *sgt,
- unsigned int *num_sgs, size_t *sensitive_size)
+ unsigned int *num_sgs)
{
unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
unsigned int iv_size = crypto_aead_ivsize(tfm);
@@ -4236,9 +4233,8 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst
len += req_size;
len = ALIGN(len, __alignof__(struct scatterlist));
len += array_size(*num_sgs, sizeof(struct scatterlist));
- *sensitive_size = len;
- p = kvzalloc(len, GFP_NOFS);
+ p = kzalloc(len, GFP_NOFS);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -4252,16 +4248,14 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst
static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
- struct aead_request **req, struct scatterlist **sgl,
- size_t *sensitive_size)
+ struct aead_request **req, struct scatterlist **sgl)
{
struct sg_table sgtable = {};
unsigned int skip, num_sgs, i, j;
ssize_t rc;
void *p;
- p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable,
- &num_sgs, sensitive_size);
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, &num_sgs);
if (IS_ERR(p))
return ERR_CAST(p);
@@ -4350,7 +4344,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
DECLARE_CRYPTO_WAIT(wait);
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
- size_t sensitive_size;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4376,8 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg,
- &sensitive_size);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
if (IS_ERR(creq))
return PTR_ERR(creq);
@@ -4407,7 +4399,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kvfree_sensitive(creq, sensitive_size);
+ kfree_sensitive(creq);
return rc;
}
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 1c63d2c9cc9c..42e2d4ea344d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -240,8 +240,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
*/
if (smb2_command != SMB2_TREE_DISCONNECT) {
spin_unlock(&tcon->tc_lock);
- cifs_dbg(FYI, "can not send cmd %d while umounting\n",
- smb2_command);
+ cifs_tcon_dbg(FYI, "can not send cmd %d while umounting\n",
+ smb2_command);
return -ENODEV;
}
}
@@ -296,9 +296,9 @@ again:
return 0;
}
spin_unlock(&ses->chan_lock);
- cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d",
- tcon->ses->chans_need_reconnect,
- tcon->need_reconnect);
+ cifs_tcon_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d\n",
+ tcon->ses->chans_need_reconnect,
+ tcon->need_reconnect);
mutex_lock(&ses->session_mutex);
/*
@@ -392,11 +392,11 @@ skip_sess_setup:
rc = cifs_tree_connect(0, tcon);
- cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+ cifs_tcon_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
/* If sess reconnected but tcon didn't, something strange ... */
mutex_unlock(&ses->session_mutex);
- cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
+ cifs_tcon_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
goto out;
}
@@ -442,8 +442,8 @@ skip_sess_setup:
from_reconnect);
goto skip_add_channels;
} else if (rc)
- cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
- __func__, rc);
+ cifs_tcon_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
if (ses->chan_max > ses->chan_count &&
ses->iface_count &&
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index a61ba7f3fb86..051cd9dbba13 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -22,6 +22,7 @@
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/task_work.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -173,9 +174,16 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
* send a packet. In most cases if we fail to send
* after the retries we will kill the socket and
* reconnect which may clear the network problem.
+ *
+ * Even if regular signals are masked, EINTR might be
+ * propagated from sk_stream_wait_memory() to here when
+ * TIF_NOTIFY_SIGNAL is used for task work. For example,
+ * certain io_uring completions will use that. Treat EINTR
+ * with pending task work the same as EAGAIN
+ * to avoid unnecessary reconnects.
*/
rc = sock_sendmsg(ssocket, smb_msg);
- if (rc == -EAGAIN) {
+ if (rc == -EAGAIN || unlikely(rc == -EINTR && task_work_pending(current))) {
retries++;
if (retries >= 14 ||
(!server->noblocksnd && (retries > 2))) {
@@ -323,8 +331,7 @@ int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
break;
total_len += sent;
}
-
-}
+ }
unmask:
sigprocmask(SIG_SETMASK, &oldmask, NULL);
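
The retry condition now treats -EINTR as transient when the only pending "signal" is task work (TIF_NOTIFY_SIGNAL, e.g. io_uring completions), since real signals are masked around the send loop. The classification reduces to a small predicate (userspace sketch; task_work_pending here is a stand-in for the kernel's task_work_pending(current)):

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

/* Retry the send instead of reconnecting when the socket is merely
 * busy, or when -EINTR was caused by pending task work rather than a
 * real (masked) signal. */
static bool send_should_retry(int rc, bool task_work_pending)
{
	return rc == -EAGAIN || (rc == -EINTR && task_work_pending);
}

int main(void)
{
	assert(send_should_retry(-EAGAIN, false));
	assert(send_should_retry(-EINTR, true));	/* io_uring task work */
	assert(!send_should_retry(-EINTR, false));	/* real signal path */
	return 0;
}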
diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile
index c66dbbc1469c..9e0730a385fb 100644
--- a/fs/smb/common/Makefile
+++ b/fs/smb/common/Makefile
@@ -3,5 +3,4 @@
# Makefile for Linux filesystem routines that are shared by client and server.
#
-obj-$(CONFIG_SMBFS) += cifs_arc4.o
obj-$(CONFIG_SMBFS) += cifs_md4.o
diff --git a/fs/smb/common/arc4.h b/fs/smb/common/arc4.h
deleted file mode 100644
index 12e71ec033a1..000000000000
--- a/fs/smb/common/arc4.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Common values for ARC4 Cipher Algorithm
- */
-
-#ifndef _CRYPTO_ARC4_H
-#define _CRYPTO_ARC4_H
-
-#include <linux/types.h>
-
-#define ARC4_MIN_KEY_SIZE 1
-#define ARC4_MAX_KEY_SIZE 256
-#define ARC4_BLOCK_SIZE 1
-
-struct arc4_ctx {
- u32 S[256];
- u32 x, y;
-};
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);
-
-#endif /* _CRYPTO_ARC4_H */
diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c
deleted file mode 100644
index df360ca47826..000000000000
--- a/fs/smb/common/cifs_arc4.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API
- *
- * ARC4 Cipher Algorithm
- *
- * Jon Oberheide <jon@oberheide.org>
- */
-
-#include <linux/module.h>
-#include "arc4.h"
-
-MODULE_DESCRIPTION("ARC4 Cipher Algorithm");
-MODULE_LICENSE("GPL");
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
-{
- int i, j = 0, k = 0;
-
- ctx->x = 1;
- ctx->y = 0;
-
- for (i = 0; i < 256; i++)
- ctx->S[i] = i;
-
- for (i = 0; i < 256; i++) {
- u32 a = ctx->S[i];
-
- j = (j + in_key[k] + a) & 0xff;
- ctx->S[i] = ctx->S[j];
- ctx->S[j] = a;
- if (++k >= key_len)
- k = 0;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
-
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
-{
- u32 *const S = ctx->S;
- u32 x, y, a, b;
- u32 ty, ta, tb;
-
- if (len == 0)
- return;
-
- x = ctx->x;
- y = ctx->y;
-
- a = S[x];
- y = (y + a) & 0xff;
- b = S[y];
-
- do {
- S[y] = a;
- a = (a + b) & 0xff;
- S[x] = b;
- x = (x + 1) & 0xff;
- ta = S[x];
- ty = (y + ta) & 0xff;
- tb = S[ty];
- *out++ = *in++ ^ S[a];
- if (--len == 0)
- break;
- y = ty;
- a = ta;
- b = tb;
- } while (true);
-
- ctx->x = x;
- ctx->y = y;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index 4a23a5e7e8fe..098cac98d31e 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -10,6 +10,7 @@ config SMB_SERVER
select CRYPTO_MD5
select CRYPTO_HMAC
select CRYPTO_ECB
+ select CRYPTO_LIB_ARC4
select CRYPTO_LIB_DES
select CRYPTO_LIB_SHA256
select CRYPTO_SHA256
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index d99871c21451..b4020bb55a26 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -20,6 +20,7 @@
#include "glob.h"
#include <linux/fips.h>
+#include <crypto/arc4.h>
#include <crypto/des.h>
#include "server.h"
@@ -29,7 +30,6 @@
#include "mgmt/user_config.h"
#include "crypto_ctx.h"
#include "transport_ipc.h"
-#include "../common/arc4.h"
/*
* Fixed format data defining GSS header and fixed string
@@ -365,10 +365,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
if (!ctx_arc4)
return -ENOMEM;
- cifs_arc4_setkey(ctx_arc4, sess->sess_key,
- SMB2_NTLMV2_SESSKEY_SIZE);
- cifs_arc4_crypt(ctx_arc4, sess->sess_key,
- (char *)authblob + sess_key_off, sess_key_len);
+ arc4_setkey(ctx_arc4, sess->sess_key, SMB2_NTLMV2_SESSKEY_SIZE);
+ arc4_crypt(ctx_arc4, sess->sess_key,
+ (char *)authblob + sess_key_off, sess_key_len);
kfree_sensitive(ctx_arc4);
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 91a934411134..b6b4f1286b9c 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -19,7 +19,7 @@ static DEFINE_MUTEX(init_lock);
static struct ksmbd_conn_ops default_conn_ops;
-LIST_HEAD(conn_list);
+DEFINE_HASHTABLE(conn_list, CONN_HASH_BITS);
DECLARE_RWSEM(conn_list_lock);
/**
@@ -33,7 +33,7 @@ DECLARE_RWSEM(conn_list_lock);
void ksmbd_conn_free(struct ksmbd_conn *conn)
{
down_write(&conn_list_lock);
- list_del(&conn->conns_list);
+ hash_del(&conn->hlist);
up_write(&conn_list_lock);
xa_destroy(&conn->sessions);
@@ -77,7 +77,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
init_waitqueue_head(&conn->req_running_q);
init_waitqueue_head(&conn->r_count_q);
- INIT_LIST_HEAD(&conn->conns_list);
INIT_LIST_HEAD(&conn->requests);
INIT_LIST_HEAD(&conn->async_requests);
spin_lock_init(&conn->request_lock);
@@ -90,19 +89,17 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
init_rwsem(&conn->session_lock);
- down_write(&conn_list_lock);
- list_add(&conn->conns_list, &conn_list);
- up_write(&conn_list_lock);
return conn;
}
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c)
{
struct ksmbd_conn *t;
+ int bkt;
bool ret = false;
down_read(&conn_list_lock);
- list_for_each_entry(t, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, t, hlist) {
if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE))
continue;
@@ -163,9 +160,10 @@ void ksmbd_conn_unlock(struct ksmbd_conn *conn)
void ksmbd_all_conn_set_status(u64 sess_id, u32 status)
{
struct ksmbd_conn *conn;
+ int bkt;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
if (conn->binding || xa_load(&conn->sessions, sess_id))
WRITE_ONCE(conn->status, status);
}
@@ -181,14 +179,14 @@ int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id)
{
struct ksmbd_conn *conn;
int rc, retry_count = 0, max_timeout = 120;
- int rcount = 1;
+ int rcount = 1, bkt;
retry_idle:
if (retry_count >= max_timeout)
return -EIO;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
if (conn->binding || xa_load(&conn->sessions, sess_id)) {
if (conn == curr_conn)
rcount = 2;
@@ -480,10 +478,11 @@ static void stop_sessions(void)
{
struct ksmbd_conn *conn;
struct ksmbd_transport *t;
+ int bkt;
again:
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
t = conn->transport;
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
@@ -494,7 +493,7 @@ again:
}
up_read(&conn_list_lock);
- if (!list_empty(&conn_list)) {
+ if (!hash_empty(conn_list)) {
msleep(100);
goto again;
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 07b43634262a..7f9bcd9817b5 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -54,11 +54,12 @@ struct ksmbd_conn {
u8 inet6_addr[16];
#endif
};
+ unsigned int inet_hash;
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
struct unicode_map *um;
- struct list_head conns_list;
+ struct hlist_node hlist;
struct rw_semaphore session_lock;
/* smb session 1 per user */
struct xarray sessions;
@@ -153,7 +154,8 @@ struct ksmbd_transport {
#define KSMBD_TCP_SEND_TIMEOUT (5 * HZ)
#define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr))
-extern struct list_head conn_list;
+#define CONN_HASH_BITS 12
+extern DECLARE_HASHTABLE(conn_list, CONN_HASH_BITS);
extern struct rw_semaphore conn_list_lock;
bool ksmbd_conn_alive(struct ksmbd_conn *conn);
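
conn_list becomes a 4096-bucket (2^CONN_HASH_BITS) hash table keyed by a hash of the peer address computed at accept time, so per-address scans no longer walk every connection. A small userspace analogue of bucketing connections by address hash (fixed array of singly linked chains; names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define CONN_HASH_BITS 12
#define NBUCKETS (1u << CONN_HASH_BITS)

struct conn {
	uint32_t inet_addr;
	struct conn *next;	/* bucket chain, like the kernel hlist */
};

static struct conn *buckets[NBUCKETS];

static unsigned int hash_addr(uint32_t addr)
{
	/* Fibonacci hashing, in the spirit of the kernel's hash_32(). */
	return (addr * 2654435769u) >> (32 - CONN_HASH_BITS);
}

static void conn_add(struct conn *c)
{
	unsigned int b = hash_addr(c->inet_addr);

	c->next = buckets[b];
	buckets[b] = c;
}

/* Count live connections from one address by walking one bucket only. */
static int conns_from(uint32_t addr)
{
	int n = 0;

	for (struct conn *c = buckets[hash_addr(addr)]; c; c = c->next)
		if (c->inet_addr == addr)
			n++;
	return n;
}

int main(void)
{
	struct conn a = { 0x7f000001, NULL }, b = { 0x7f000001, NULL };
	struct conn c = { 0x0a000001, NULL };

	conn_add(&a);
	conn_add(&b);
	conn_add(&c);
	printf("%d %d\n", conns_from(0x7f000001), conns_from(0x0a000001)); /* 2 1 */
	return 0;
}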
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 3f07a612c05b..8ccd57fd904b 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -112,10 +112,11 @@ struct ksmbd_startup_request {
__u32 smbd_max_io_size; /* smbd read write size */
__u32 max_connections; /* Number of maximum simultaneous connections */
__s8 bind_interfaces_only;
- __s8 reserved[503]; /* Reserved room */
+ __u32 max_ip_connections; /* Maximum number of connections per IP address */
+ __s8 reserved[499]; /* Reserved room */
__u32 ifc_list_sz; /* interfaces list size */
__s8 ____payload[];
-};
+} __packed;
#define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload)
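
Carving max_ip_connections out of the reserved room (503 down to 499 bytes, exactly the 4 bytes the new __u32 occupies) keeps struct ksmbd_startup_request the same size, so existing ksmbd-tools binaries keep interoperating; __packed pins the layout. The invariant can be checked at compile time (sketch with stand-in struct names, modeling only the tail of the structure):

#include <assert.h>
#include <stdint.h>

/* Old and new tail layouts of the startup request; only the fields
 * around the reserved room are modeled here. */
struct tail_old {
	int8_t bind_interfaces_only;
	int8_t reserved[503];
	uint32_t ifc_list_sz;
} __attribute__((packed));

struct tail_new {
	int8_t bind_interfaces_only;
	uint32_t max_ip_connections;	/* carved out of reserved */
	int8_t reserved[499];
	uint32_t ifc_list_sz;
} __attribute__((packed));

/* The new field must consume exactly the bytes it took from reserved. */
static_assert(sizeof(struct tail_old) == sizeof(struct tail_new),
	      "netlink ABI size must not change");

int main(void) { return 0; }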
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index d3d5f99bdd34..c9b1108d6e96 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -19,7 +19,7 @@
#include "../transport_ipc.h"
#include "../misc.h"
-#define SHARE_HASH_BITS 3
+#define SHARE_HASH_BITS 12
static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
static DECLARE_RWSEM(shares_table_lock);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 9dec4c2940bc..6fa025374f2f 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -18,7 +18,7 @@
static DEFINE_IDA(session_ida);
-#define SESSION_HASH_BITS 3
+#define SESSION_HASH_BITS 12
static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS);
static DECLARE_RWSEM(sessions_table_lock);
@@ -104,29 +104,32 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
if (!entry)
return -ENOMEM;
- down_read(&sess->rpc_lock);
entry->method = method;
entry->id = id = ksmbd_ipc_id_alloc();
if (id < 0)
goto free_entry;
+
+ down_write(&sess->rpc_lock);
old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP);
- if (xa_is_err(old))
+ if (xa_is_err(old)) {
+ up_write(&sess->rpc_lock);
goto free_id;
+ }
resp = ksmbd_rpc_open(sess, id);
- if (!resp)
- goto erase_xa;
+ if (!resp) {
+ xa_erase(&sess->rpc_handle_list, entry->id);
+ up_write(&sess->rpc_lock);
+ goto free_id;
+ }
- up_read(&sess->rpc_lock);
+ up_write(&sess->rpc_lock);
kvfree(resp);
return id;
-erase_xa:
- xa_erase(&sess->rpc_handle_list, entry->id);
free_id:
ksmbd_rpc_id_free(entry->id);
free_entry:
kfree(entry);
- up_read(&sess->rpc_lock);
return -EINVAL;
}
@@ -144,9 +147,14 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
+ int method;
+ down_read(&sess->rpc_lock);
entry = xa_load(&sess->rpc_handle_list, id);
- return entry ? entry->method : 0;
+ method = entry ? entry->method : 0;
+ up_read(&sess->rpc_lock);
+
+ return method;
}
void ksmbd_session_destroy(struct ksmbd_session *sess)
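
The fix above takes the write side of sess->rpc_lock around xa_store()/xa_erase() (mutating the XArray under down_read() permitted concurrent writers), and ksmbd_session_rpc_method() now holds the read side across the lookup so the entry cannot be freed mid-read. The same discipline in a userspace rwlock sketch (pthread; the array stands in for the xarray):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t rpc_lock = PTHREAD_RWLOCK_INITIALIZER;
static int handles[16];	/* stand-in for sess->rpc_handle_list */

/* Writers: insertion/removal must hold the write lock exclusively. */
static void rpc_open(int id, int method)
{
	pthread_rwlock_wrlock(&rpc_lock);
	handles[id] = method;
	pthread_rwlock_unlock(&rpc_lock);
}

/* Readers: lookups take the read lock so entries cannot vanish
 * mid-read, mirroring ksmbd_session_rpc_method(). */
static int rpc_method(int id)
{
	int method;

	pthread_rwlock_rdlock(&rpc_lock);
	method = handles[id];
	pthread_rwlock_unlock(&rpc_lock);
	return method;
}

int main(void)
{
	rpc_open(3, 42);
	printf("%d\n", rpc_method(3));	/* 42 */
	return 0;
}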
diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h
index 995555febe7d..b8a7317be86b 100644
--- a/fs/smb/server/server.h
+++ b/fs/smb/server/server.h
@@ -43,6 +43,7 @@ struct ksmbd_server_config {
unsigned int auth_mechs;
unsigned int max_connections;
unsigned int max_inflight_req;
+ unsigned int max_ip_connections;
char *conf[SERVER_CONF_WORK_GROUP + 1];
struct task_struct *dh_task;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0c069eff80b7..ab1d45fcebde 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -5629,7 +5629,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
if (!work->tcon->posix_extensions) {
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
- rc = -EOPNOTSUPP;
+ path_put(&path);
+ return -EOPNOTSUPP;
} else {
info = (struct filesystem_posix_info *)(rsp->Buffer);
info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize);
@@ -7361,7 +7362,7 @@ int smb2_lock(struct ksmbd_work *work)
int nolock = 0;
LIST_HEAD(lock_list);
LIST_HEAD(rollback_list);
- int prior_lock = 0;
+ int prior_lock = 0, bkt;
WORK_BUFFERS(work, req, rsp);
@@ -7471,7 +7472,7 @@ int smb2_lock(struct ksmbd_work *work)
nolock = 1;
/* check locks in connection list */
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list) {
+ hash_for_each(conn_list, bkt, conn, hlist) {
spin_lock(&conn->llist_lock);
list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
if (file_inode(cmp_lock->fl->c.flc_file) !=
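
The smb2_get_info_filesystem() hunk above is a reference-leak fix: the early -EOPNOTSUPP return must first drop the path reference taken earlier in the function. A toy userspace refcount sketch of the pattern (names are hypothetical, not the VFS API):

    /* Error-path reference handling in the spirit of the path_put() fix:
     * every early return after a successful "get" pairs with a "put". */
    #include <stdio.h>

    struct obj { int refcount; };

    static void get_obj(struct obj *o) { o->refcount++; }
    static void put_obj(struct obj *o) { o->refcount--; }

    static int use_obj(struct obj *o, int supported)
    {
        get_obj(o);
        if (!supported) {
            put_obj(o);     /* drop the reference before bailing out */
            return -95;     /* -EOPNOTSUPP */
        }
        /* ... work with o ... */
        put_obj(o);
        return 0;
    }

    int main(void)
    {
        struct obj o = { 0 };

        use_obj(&o, 0);
        printf("refcount after error path: %d (must be 0)\n", o.refcount);
        return 0;
    }
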
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 2a3e2b0ce557..2aa1b29bea08 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -335,6 +335,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
if (req->max_connections)
server_conf.max_connections = req->max_connections;
+ if (req->max_ip_connections)
+ server_conf.max_ip_connections = req->max_ip_connections;
+
ret = ksmbd_set_netbios_name(req->netbios_name);
ret |= ksmbd_set_server_string(req->server_string);
ret |= ksmbd_set_work_group(req->work_group);
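
The IPC hunk keeps the existing convention that zero from user space leaves a knob unset; only a non-zero max_ip_connections overrides the default. A trivial userspace sketch, assuming a hypothetical server_conf mirror:

    /* Zero-means-unset convention for the new max_ip_connections knob;
     * userspace illustration with hypothetical names. */
    #include <stdio.h>

    struct server_conf { unsigned int max_ip_connections; };

    static void apply_startup(struct server_conf *conf, unsigned int requested)
    {
        /* 0 from user space leaves the limit disabled */
        if (requested)
            conf->max_ip_connections = requested;
    }

    int main(void)
    {
        struct server_conf conf = { 0 };

        apply_startup(&conf, 0);
        printf("limit: %u (0 = unlimited)\n", conf.max_ip_connections);
        apply_startup(&conf, 32);
        printf("limit: %u\n", conf.max_ip_connections);
        return 0;
    }
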
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 9e644a0daf1c..b3077766d6ec 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -425,6 +425,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
conn = ksmbd_conn_alloc();
if (!conn)
goto err;
+
+ down_write(&conn_list_lock);
+ hash_add(conn_list, &conn->hlist, 0);
+ up_write(&conn_list_lock);
+
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops;
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 4337df97987d..7a1e3dcc2cde 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -86,13 +86,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
}
#if IS_ENABLED(CONFIG_IPV6)
- if (client_sk->sk->sk_family == AF_INET6)
+ if (client_sk->sk->sk_family == AF_INET6) {
memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16);
- else
+ conn->inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr);
+ } else {
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+ conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+ }
#else
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+ conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
#endif
+ down_write(&conn_list_lock);
+ hash_add(conn_list, &conn->hlist, conn->inet_hash);
+ up_write(&conn_list_lock);
+
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
@@ -170,17 +178,6 @@ static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs
return new_iov;
}
-static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa)
-{
- switch (sa->sa_family) {
- case AF_INET:
- return ntohs(((struct sockaddr_in *)sa)->sin_port);
- case AF_INET6:
- return ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
- }
- return 0;
-}
-
/**
* ksmbd_tcp_new_connection() - create a new tcp session on mount
* @client_sk: socket associated with new connection
@@ -192,7 +189,6 @@ static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa)
*/
static int ksmbd_tcp_new_connection(struct socket *client_sk)
{
- struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
struct task_struct *handler;
@@ -203,27 +199,26 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
return -ENOMEM;
}
- csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn);
- if (kernel_getpeername(client_sk, csin) < 0) {
- pr_err("client ip resolution failed\n");
- rc = -EINVAL;
- goto out_error;
- }
-
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI6c",
+ &KSMBD_TRANS(t)->conn->inet6_addr);
+ else
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI4",
+ &KSMBD_TRANS(t)->conn->inet_addr);
+#else
handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
+ KSMBD_TRANS(t)->conn, "ksmbd:%pI4",
+ &KSMBD_TRANS(t)->conn->inet_addr);
+#endif
if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
-
-out_error:
- free_transport(t);
- return rc;
}
/**
@@ -237,7 +232,8 @@ static int ksmbd_kthread_fn(void *p)
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
struct ksmbd_conn *conn;
- int ret;
+ int ret, inet_hash;
+ unsigned int max_ip_conns;
while (!kthread_should_stop()) {
mutex_lock(&iface->sock_release_lock);
@@ -255,34 +251,49 @@ static int ksmbd_kthread_fn(void *p)
continue;
}
+ if (!server_conf.max_ip_connections)
+ goto skip_max_ip_conns_limit;
+
/*
* Limits repeated connections from clients with the same IP.
*/
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr);
+ else
+ inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+#else
+ inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr);
+#endif
+
+ max_ip_conns = 0;
down_read(&conn_list_lock);
- list_for_each_entry(conn, &conn_list, conns_list)
+ hash_for_each_possible(conn_list, conn, hlist, inet_hash) {
#if IS_ENABLED(CONFIG_IPV6)
if (client_sk->sk->sk_family == AF_INET6) {
if (memcmp(&client_sk->sk->sk_v6_daddr,
- &conn->inet6_addr, 16) == 0) {
- ret = -EAGAIN;
- break;
- }
+ &conn->inet6_addr, 16) == 0)
+ max_ip_conns++;
} else if (inet_sk(client_sk->sk)->inet_daddr ==
- conn->inet_addr) {
- ret = -EAGAIN;
- break;
- }
+ conn->inet_addr)
+ max_ip_conns++;
#else
if (inet_sk(client_sk->sk)->inet_daddr ==
- conn->inet_addr) {
+ conn->inet_addr)
+ max_ip_conns++;
+#endif
+ if (server_conf.max_ip_connections <= max_ip_conns) {
+ pr_info_ratelimited("Maximum IP connections exceeded (%u/%u)\n",
+ max_ip_conns, server_conf.max_ip_connections);
ret = -EAGAIN;
break;
}
-#endif
+ }
up_read(&conn_list_lock);
if (ret == -EAGAIN)
continue;
+skip_max_ip_conns_limit:
if (server_conf.max_connections &&
atomic_inc_return(&active_num_conn) >= server_conf.max_connections) {
pr_info_ratelimited("Limit the maximum number of connections(%u)\n",
@@ -468,12 +479,13 @@ static int create_socket(struct interface *iface)
struct socket *ksmbd_socket;
bool ipv4 = false;
- ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
+ ret = sock_create_kern(current->nsproxy->net_ns, PF_INET6, SOCK_STREAM,
+ IPPROTO_TCP, &ksmbd_socket);
if (ret) {
if (ret != -EAFNOSUPPORT)
pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret);
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
- &ksmbd_socket);
+ ret = sock_create_kern(current->nsproxy->net_ns, PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
if (ret) {
pr_err("Can't create socket for ipv4: %d\n", ret);
goto out_clear;
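
With connections now kept in a hash table keyed by an address hash (ipv4_addr_hash()/ipv6_addr_hash(); the RDMA transport files its entries under key 0), the per-IP limit check only has to walk one bucket instead of the whole connection list. A userspace sketch of the bucket walk with a stand-in hash (hypothetical helpers, not the kernel hashtable API):

    /* Counting live connections from one peer by walking a single hash
     * bucket, in the spirit of the new limit check. */
    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CONN_HASH_BITS 8
    #define CONN_BUCKETS   (1u << CONN_HASH_BITS)

    struct conn {
        uint32_t inet_addr;        /* IPv4 peer, network byte order */
        struct conn *next;         /* bucket chain */
    };

    static struct conn *conn_table[CONN_BUCKETS];

    static unsigned int ipv4_bucket(uint32_t addr)
    {
        /* stand-in for ipv4_addr_hash() folded onto the table size */
        return (addr ^ (addr >> 16)) & (CONN_BUCKETS - 1);
    }

    /* The caller compares the count against max_ip_connections and
     * rejects the new socket with -EAGAIN once the limit is reached. */
    static unsigned int count_conns_from(uint32_t addr)
    {
        unsigned int n = 0;

        for (struct conn *c = conn_table[ipv4_bucket(addr)]; c; c = c->next)
            if (c->inet_addr == addr)
                n++;
        return n;
    }

    int main(void)
    {
        uint32_t peer;
        struct conn a = { 0 }, b = { 0 };

        inet_pton(AF_INET, "192.0.2.7", &peer);
        a.inet_addr = b.inet_addr = peer;
        b.next = &a;
        conn_table[ipv4_bucket(peer)] = &b;

        printf("connections from 192.0.2.7: %u\n", count_conns_from(peer));
        return 0;
    }
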
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 1cfa688904b2..891ed2dc2b73 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -20,6 +20,7 @@
#include <linux/sched/xacct.h>
#include <linux/crc32c.h>
#include <linux/namei.h>
+#include <linux/splice.h>
#include "glob.h"
#include "oplock.h"
@@ -72,7 +73,7 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
{
struct qstr last;
struct filename *filename __free(putname) = NULL;
- struct path *root_share_path = &share_conf->vfs_path;
+ const struct path *root_share_path = &share_conf->vfs_path;
int err, type;
struct dentry *d;
@@ -1305,7 +1306,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath,
caseless, true);
}
-void ksmbd_vfs_kern_path_unlock(struct path *path)
+void ksmbd_vfs_kern_path_unlock(const struct path *path)
{
/* While lock is still held, ->d_parent is safe */
inode_unlock(d_inode(path->dentry->d_parent));
@@ -1829,8 +1830,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
if (src_off + len > src_file_size)
return -E2BIG;
- ret = vfs_copy_file_range(src_fp->filp, src_off,
- dst_fp->filp, dst_off, len, 0);
+ /*
+ * vfs_copy_file_range does not allow overlapped copying
+ * within the same file.
+ */
+ if (file_inode(src_fp->filp) == file_inode(dst_fp->filp) &&
+ dst_off + len > src_off &&
+ dst_off < src_off + len)
+ ret = do_splice_direct(src_fp->filp, &src_off,
+ dst_fp->filp, &dst_off,
+ min_t(size_t, len, MAX_RW_COUNT), 0);
+ else
+ ret = vfs_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off, len, 0);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len,
@@ -1855,7 +1867,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
}
int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
- struct path *path)
+ const struct path *path)
{
struct posix_acl_state acl_state;
struct posix_acl *acls;
@@ -1908,7 +1920,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
}
int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
- struct path *path, struct inode *parent_inode)
+ const struct path *path, struct inode *parent_inode)
{
struct posix_acl *acls;
struct posix_acl_entry *pace;
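
The copy-range hunk above hinges on a half-open interval test: [src_off, src_off+len) and [dst_off, dst_off+len) intersect exactly when each range starts before the other ends, and only then is the copy routed through do_splice_direct(), since vfs_copy_file_range() does not allow overlapped copying within one file. The predicate in isolation, as a userspace sketch:

    /* The same-file overlap test used before choosing the splice path. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool ranges_overlap(unsigned long long src_off,
                               unsigned long long dst_off,
                               unsigned long long len)
    {
        /* the ranges intersect iff each starts before the other ends */
        return dst_off + len > src_off && dst_off < src_off + len;
    }

    int main(void)
    {
        printf("src 0, dst 512, len 1024 -> %d (overlaps)\n",
               ranges_overlap(0, 512, 1024));
        printf("src 0, dst 4096, len 1024 -> %d (disjoint)\n",
               ranges_overlap(0, 4096, 1024));
        return 0;
    }
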
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index d47472f3e30b..df6421b4590b 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -123,7 +123,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags,
struct path *path, bool caseless);
-void ksmbd_vfs_kern_path_unlock(struct path *path);
+void ksmbd_vfs_kern_path_unlock(const struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -164,8 +164,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
struct dentry *dentry,
struct xattr_dos_attrib *da);
int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
- struct path *path);
+ const struct path *path);
int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
- struct path *path,
+ const struct path *path,
struct inode *parent_inode);
#endif /* __KSMBD_VFS_H__ */
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index ce7d661d5ad8..1582e0637a7e 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -307,7 +307,8 @@ static int fill_meta_index(struct inode *inode, int index,
all_done:
*index_block = cur_index_block;
*index_offset = cur_offset;
- *data_block = cur_data_block;
+ if (data_block)
+ *data_block = cur_data_block;
/*
* Scale cache index (cache slot entry) to index
@@ -324,17 +325,15 @@ failed:
* Get the on-disk location and compressed size of the datablock
* specified by index. Fill_meta_index() does most of the work.
*/
-static int read_blocklist(struct inode *inode, int index, u64 *block)
+static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start,
+ int *offset, u64 *block)
{
- u64 start;
long long blks;
- int offset;
__le32 size;
- int res = fill_meta_index(inode, index, &start, &offset, block);
+ int res = fill_meta_index(inode, index, start, offset, block);
- TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
- " 0x%x, block 0x%llx\n", res, index, start, offset,
- *block);
+ TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n",
+ res, index, *start, *offset, block ? *block : 0);
if (res < 0)
return res;
@@ -346,22 +345,31 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
* extra block indexes needed.
*/
if (res < index) {
- blks = read_indexes(inode->i_sb, index - res, &start, &offset);
+ blks = read_indexes(inode->i_sb, index - res, start, offset);
if (blks < 0)
return (int) blks;
- *block += blks;
+ if (block)
+ *block += blks;
}
/*
* Read length of block specified by index.
*/
- res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+ res = squashfs_read_metadata(inode->i_sb, &size, start, offset,
sizeof(size));
if (res < 0)
return res;
return squashfs_block_size(size);
}
+static inline int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+ u64 start;
+ int offset;
+
+ return read_blocklist_ptrs(inode, index, &start, &offset, block);
+}
+
static bool squashfs_fill_page(struct folio *folio,
struct squashfs_cache_entry *buffer, size_t offset,
size_t avail)
@@ -658,7 +666,114 @@ skip_pages:
kfree(pages);
}
+static loff_t seek_hole_data(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start, index = offset >> msblk->block_log;
+ u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log;
+ int s_offset, length;
+ __le32 *blist = NULL;
+
+ /* reject offset if negative or beyond file end */
+ if ((unsigned long long)offset >= i_size_read(inode))
+ return -ENXIO;
+
+ /* is offset within tailend and is tailend packed into a fragment? */
+ if (index + 1 == file_end &&
+ squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
+ if (whence == SEEK_DATA)
+ return offset;
+
+ /* there is an implicit hole at the end of any file */
+ return i_size_read(inode);
+ }
+
+ length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL);
+ if (length < 0)
+ return length;
+
+ /* nothing more to do if offset matches desired whence value */
+ if ((length == 0 && whence == SEEK_HOLE) ||
+ (length && whence == SEEK_DATA))
+ return offset;
+
+ /* skip scanning forwards if we're at file end */
+ if (++ index == file_end)
+ goto not_found;
+
+ blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL);
+ if (blist == NULL) {
+ ERROR("%s: Failed to allocate block_list\n", __func__);
+ return -ENOMEM;
+ }
+
+ while (index < file_end) {
+ int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES);
+
+ offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2);
+ if (offset < 0)
+ goto finished;
+
+ for (i = 0; i < indexes; i++) {
+ length = squashfs_block_size(blist[i]);
+ if (length < 0) {
+ offset = length;
+ goto finished;
+ }
+
+ /* does this block match desired whence value? */
+ if ((length == 0 && whence == SEEK_HOLE) ||
+ (length && whence == SEEK_DATA)) {
+ offset = (index + i) << msblk->block_log;
+ goto finished;
+ }
+ }
+
+ index += indexes;
+ }
+
+not_found:
+ /* whence value determines what happens */
+ if (whence == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ /* there is an implicit hole at the end of any file */
+ offset = i_size_read(inode);
+
+finished:
+ kfree(blist);
+ return offset;
+}
+
+static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ switch (whence) {
+ default:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ offset = seek_hole_data(file, offset, whence);
+ break;
+ }
+
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct address_space_operations squashfs_aops = {
.read_folio = squashfs_read_folio,
.readahead = squashfs_readahead
};
+
+const struct file_operations squashfs_file_operations = {
+ .llseek = squashfs_llseek,
+ .read_iter = generic_file_read_iter,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
+ .splice_read = filemap_splice_read
+};
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index d5918eba27e3..cceae3b78698 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -68,6 +68,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
+ /* File type must not be set at this moment, for it will later be set by the caller. */
+ if (inode->i_mode & S_IFMT)
+ err = -EIO;
+
return err;
}
@@ -140,8 +144,17 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ inode->i_size = le32_to_cpu(sqsh_ino->file_size);
frag = le32_to_cpu(sqsh_ino->fragment);
if (frag != SQUASHFS_INVALID_FRAG) {
+ /*
+ * the file cannot have a fragment (tailend) and have a
+ * file size a multiple of the block size
+ */
+ if ((inode->i_size & (msblk->block_size - 1)) == 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag_offset = le32_to_cpu(sqsh_ino->offset);
frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
if (frag_size < 0) {
@@ -155,8 +168,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
}
set_nlink(inode, 1);
- inode->i_size = le32_to_cpu(sqsh_ino->file_size);
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &squashfs_file_operations;
inode->i_mode |= S_IFREG;
inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
squashfs_i(inode)->fragment_block = frag_blk;
@@ -165,6 +177,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
squashfs_i(inode)->block_list_start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
inode->i_data.a_ops = &squashfs_aops;
TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -183,8 +196,21 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+ if (inode->i_size < 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag = le32_to_cpu(sqsh_ino->fragment);
if (frag != SQUASHFS_INVALID_FRAG) {
+ /*
+ * the file cannot have a fragment (tailend) and have a
+ * file size a multiple of the block size
+ */
+ if ((inode->i_size & (msblk->block_size - 1)) == 0) {
+ err = -EINVAL;
+ goto failed_read;
+ }
frag_offset = le32_to_cpu(sqsh_ino->offset);
frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
if (frag_size < 0) {
@@ -199,9 +225,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
xattr_id = le32_to_cpu(sqsh_ino->xattr);
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
- inode->i_size = le64_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_inode_ops;
- inode->i_fop = &generic_ro_fops;
+ inode->i_fop = &squashfs_file_operations;
inode->i_mode |= S_IFREG;
inode->i_blocks = (inode->i_size -
le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
@@ -212,6 +237,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
squashfs_i(inode)->block_list_start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
inode->i_data.a_ops = &squashfs_aops;
TRACE("File inode %x:%x, start_block %llx, block_list_start "
@@ -292,6 +318,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
squashfs_i(inode)->offset = offset;
+ squashfs_i(inode)->parent = 0;
if (type == SQUASHFS_LSYMLINK_TYPE) {
__le32 xattr;
@@ -329,6 +356,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+ squashfs_i(inode)->parent = 0;
TRACE("Device inode %x:%x, rdev %x\n",
SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -353,6 +381,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
rdev = le32_to_cpu(sqsh_ino->rdev);
init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+ squashfs_i(inode)->parent = 0;
TRACE("Device inode %x:%x, rdev %x\n",
SQUASHFS_INODE_BLK(ino), offset, rdev);
@@ -373,6 +402,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_mode |= S_IFSOCK;
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
+ squashfs_i(inode)->parent = 0;
break;
}
case SQUASHFS_LFIFO_TYPE:
@@ -392,6 +422,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_op = &squashfs_inode_ops;
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
init_special_inode(inode, inode->i_mode, 0);
+ squashfs_i(inode)->parent = 0;
break;
}
default:
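
The inode checks above encode a format invariant: a tail-end fragment only exists when the file size is not a multiple of the block size, so an inode claiming both is corrupt. The rule in isolation, as a userspace sketch with assumed sizes:

    /* The fragment/size consistency rule the new checks enforce. */
    #include <stdbool.h>
    #include <stdio.h>

    #define SQUASHFS_INVALID_FRAG 0xffffffffu

    static bool inode_consistent(unsigned long long size,
                                 unsigned int block_size,
                                 unsigned int frag)
    {
        /* a fragment together with a block-aligned size is corrupt */
        if (frag != SQUASHFS_INVALID_FRAG &&
            (size & (block_size - 1)) == 0)
            return false;
        return true;
    }

    int main(void)
    {
        /* 256 KiB file in 128 KiB blocks claiming a fragment: rejected */
        printf("%d\n", inode_consistent(262144, 131072, 5));
        /* 200000-byte file, fragment holds the 68928-byte tail: fine */
        printf("%d\n", inode_consistent(200000, 131072, 5));
        return 0;
    }
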
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 218868b20f16..4851bd964502 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -107,6 +107,7 @@ extern const struct address_space_operations squashfs_aops;
/* inode.c */
extern const struct inode_operations squashfs_inode_ops;
+extern const struct file_operations squashfs_file_operations;
/* namei.c */
extern const struct inode_operations squashfs_dir_inode_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 95f8e8901768..a955d9369749 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -208,6 +208,7 @@ static inline int squashfs_block_size(__le32 raw)
#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
#define SQUASHFS_META_ENTRIES 127
#define SQUASHFS_META_SLOTS 8
+#define SQUASHFS_SCAN_INDEXES 1024
struct meta_entry {
u64 data_block;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 2c82d6f2a456..8e497ac07b9a 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -16,6 +16,7 @@ struct squashfs_inode_info {
u64 xattr;
unsigned int xattr_size;
int xattr_count;
+ int parent;
union {
struct {
u64 fragment_block;
@@ -27,7 +28,6 @@ struct squashfs_inode_info {
u64 dir_idx_start;
int dir_idx_offset;
int dir_idx_cnt;
- int parent;
};
};
struct inode vfs_inode;
diff --git a/fs/stat.c b/fs/stat.c
index f95c1dc3eaa4..6c79661e1b96 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -293,7 +293,7 @@ static int statx_lookup_flags(int flags)
return lookup_flags;
}
-static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
+static int vfs_statx_path(const struct path *path, int flags, struct kstat *stat,
u32 request_mask)
{
int error = vfs_getattr(path, stat, request_mask, flags);
diff --git a/fs/super.c b/fs/super.c
index f4fa0e93c463..5bab94fb7e03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
if (!s)
return NULL;
- INIT_LIST_HEAD(&s->s_mounts);
s->s_user_ns = get_user_ns(user_ns);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -408,7 +407,7 @@ static void __put_super(struct super_block *s)
list_del_init(&s->s_list);
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
- WARN_ON(!list_empty(&s->s_mounts));
+ WARN_ON(s->s_mounts);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f24aa98e6869..a79d73f28aa7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -2272,6 +2272,9 @@ int udf_current_aext(struct inode *inode, struct extent_position *epos,
if (check_add_overflow(sizeof(struct allocExtDesc),
le32_to_cpu(header->lengthAllocDescs), &alen))
return -1;
+
+ if (alen > epos->bh->b_size)
+ return -1;
}
switch (iinfo->i_alloc_type) {
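
The udf hunk pairs an overflow check with a bounds check: the allocation-descriptor length read from disk must neither wrap when added to the header size nor exceed the buffer head that backs it, closing an out-of-bounds read on crafted images. A userspace sketch using __builtin_add_overflow, the primitive behind the kernel's check_add_overflow():

    /* Overflow-then-bounds validation like the udf_current_aext() fix. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool alen_valid(uint32_t header_size, uint32_t length_descs,
                           size_t buf_size)
    {
        uint32_t alen;

        if (__builtin_add_overflow(header_size, length_descs, &alen))
            return false;       /* addition wrapped: reject */
        if (alen > buf_size)
            return false;       /* exceeds the on-disk block: reject */
        return true;
    }

    int main(void)
    {
        printf("%d\n", alen_valid(24, 0xffffffff - 8, 4096)); /* overflow */
        printf("%d\n", alen_valid(24, 8192, 4096));           /* too big */
        printf("%d\n", alen_valid(24, 1024, 4096));           /* ok */
        return 0;
    }
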
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 770e29ec3557..42bedc4ec7af 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -315,46 +315,39 @@ static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
{
struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
struct vboxsf_handle *sf_handle;
- struct dentry *res = NULL;
u64 handle;
int err;
if (d_in_lookup(dentry)) {
- res = vboxsf_dir_lookup(parent, dentry, 0);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- if (res)
- dentry = res;
+ struct dentry *res = vboxsf_dir_lookup(parent, dentry, 0);
+ if (res || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
}
/* Only creates */
- if (!(flags & O_CREAT) || d_really_is_positive(dentry))
- return finish_no_open(file, res);
+ if (!(flags & O_CREAT))
+ return finish_no_open(file, NULL);
err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle);
if (err)
- goto out;
+ return err;
sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle, SHFL_CF_ACCESS_READWRITE);
if (IS_ERR(sf_handle)) {
vboxsf_close(sbi->root, handle);
- err = PTR_ERR(sf_handle);
- goto out;
+ return PTR_ERR(sf_handle);
}
err = finish_open(file, dentry, generic_file_open);
if (err) {
/* This also closes the handle passed to vboxsf_create_sf_handle() */
vboxsf_release_sf_handle(d_inode(dentry), sf_handle);
- goto out;
+ return err;
}
file->private_data = sf_handle;
file->f_mode |= FMODE_CREATED;
-out:
- dput(res);
- return err;
+ return 0;
}
static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
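
The vboxsf_dir_atomic_open() rework above replaces the shared out:/dput() exit with direct returns: the lookup result is consumed at once by finish_no_open(), and every later failure path owns nothing that still needs dropping. A toy sketch of that ownership shape (stub names throughout, not the VFS API):

    /* Consume-or-return ownership shape of the reworked open path. */
    #include <stdio.h>
    #include <stdlib.h>

    struct dentry { int unused; };

    /* consumes @res unconditionally, NULL included */
    static int finish_no_open_stub(struct dentry *res)
    {
        free(res);
        return 1;   /* caller falls back to an ordinary open */
    }

    static int atomic_open_stub(int in_lookup, int positive, int o_creat)
    {
        if (in_lookup) {
            struct dentry *res = calloc(1, sizeof(*res)); /* stand-in lookup */

            if (res || positive)
                return finish_no_open_stub(res); /* ownership handed off */
        }
        if (!o_creat)
            return finish_no_open_stub(NULL);
        /* ... create and open, every failure returning directly ... */
        return 0;
    }

    int main(void)
    {
        printf("open result: %d\n", atomic_open_stub(1, 0, 1));
        return 0;
    }
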