summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h23
-rw-r--r--fs/9p/vfs_file.c6
-rw-r--r--fs/9p/vfs_inode.c23
-rw-r--r--fs/9p/vfs_inode_dotl.c27
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/mntpt.c149
-rw-r--r--fs/afs/super.c432
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c74
-rw-r--r--fs/btrfs/extent_io.c4
-rw-r--r--fs/btrfs/ioctl.c21
-rw-r--r--fs/btrfs/qgroup.c13
-rw-r--r--fs/btrfs/root-tree.c8
-rw-r--r--fs/btrfs/zstd.c6
-rw-r--r--fs/ceph/caps.c72
-rw-r--r--fs/ceph/debugfs.c27
-rw-r--r--fs/ceph/dir.c455
-rw-r--r--fs/ceph/file.c13
-rw-r--r--fs/ceph/inode.c52
-rw-r--r--fs/ceph/mds_client.c698
-rw-r--r--fs/ceph/mds_client.h43
-rw-r--r--fs/ceph/snap.c159
-rw-r--r--fs/ceph/super.c21
-rw-r--r--fs/ceph/super.h43
-rw-r--r--fs/ceph/xattr.c20
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_ioctl.h3
-rw-r--r--fs/cifs/cifsglob.h16
-rw-r--r--fs/cifs/connect.c4
-rw-r--r--fs/cifs/dir.c107
-rw-r--r--fs/cifs/file.c14
-rw-r--r--fs/cifs/smb1ops.c126
-rw-r--r--fs/cifs/smb2inode.c87
-rw-r--r--fs/cifs/smb2ops.c423
-rw-r--r--fs/cifs/smb2pdu.c145
-rw-r--r--fs/cifs/smb2pdu.h7
-rw-r--r--fs/cifs/smb2proto.h7
-rw-r--r--fs/cifs/smb2status.h6
-rw-r--r--fs/cifs/trace.h124
-rw-r--r--fs/cifs/transport.c226
-rw-r--r--fs/dax.c25
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/extents.c29
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/indirect.c6
-rw-r--r--fs/ext4/inode.c21
-rw-r--r--fs/ext4/ioctl.c101
-rw-r--r--fs/ext4/mballoc.c7
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c1
-rw-r--r--fs/ext4/sysfs.c13
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/checkpoint.c20
-rw-r--r--fs/f2fs/data.c59
-rw-r--r--fs/f2fs/debug.c19
-rw-r--r--fs/f2fs/dir.c15
-rw-r--r--fs/f2fs/extent_cache.c2
-rw-r--r--fs/f2fs/f2fs.h77
-rw-r--r--fs/f2fs/file.c46
-rw-r--r--fs/f2fs/inline.c12
-rw-r--r--fs/f2fs/inode.c15
-rw-r--r--fs/f2fs/namei.c3
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/segment.c80
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c109
-rw-r--r--fs/f2fs/sysfs.c17
-rw-r--r--fs/f2fs/trace.c20
-rw-r--r--fs/f2fs/xattr.c25
-rw-r--r--fs/f2fs/xattr.h6
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/fs_context.c642
-rw-r--r--fs/fs_parser.c447
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c7
-rw-r--r--fs/fuse/dev.c115
-rw-r--r--fs/fuse/dir.c54
-rw-r--r--fs/fuse/file.c342
-rw-r--r--fs/fuse/fuse_i.h28
-rw-r--r--fs/fuse/inode.c28
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/hpfs/hpfs.h8
-rw-r--r--fs/hugetlbfs/inode.c358
-rw-r--r--fs/internal.h13
-rw-r--r--fs/io_uring.c2
-rw-r--r--fs/jbd2/checkpoint.c17
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c90
-rw-r--r--fs/jbd2/transaction.c83
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/kernfs/mount.c119
-rw-r--r--fs/lockd/clnt4xdr.c14
-rw-r--r--fs/lockd/clntxdr.c14
-rw-r--r--fs/mount.h5
-rw-r--r--fs/namei.c5
-rw-r--r--fs/namespace.c395
-rw-r--r--fs/nfs/callback_xdr.c64
-rw-r--r--fs/nfs/delegation.c22
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c98
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/file.c44
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c225
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h75
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c161
-rw-r--r--fs/nfs/inode.c33
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/io.c12
-rw-r--r--fs/nfs/namespace.c8
-rw-r--r--fs/nfs/nfs2xdr.c124
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c209
-rw-r--r--fs/nfs/nfs42.h3
-rw-r--r--fs/nfs/nfs42proc.c164
-rw-r--r--fs/nfs/nfs42xdr.c130
-rw-r--r--fs/nfs/nfs4client.c33
-rw-r--r--fs/nfs/nfs4namespace.c5
-rw-r--r--fs/nfs/nfs4proc.c138
-rw-r--r--fs/nfs/nfs4session.c7
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/nfs4trace.h25
-rw-r--r--fs/nfs/nfs4xdr.c530
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h85
-rw-r--r--fs/nfs/pagelist.c47
-rw-r--r--fs/nfs/pnfs.c35
-rw-r--r--fs/nfs/pnfs.h2
-rw-r--r--fs/nfs/pnfs_dev.c13
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/unlink.c8
-rw-r--r--fs/nfs/write.c19
-rw-r--r--fs/nfsd/nfs3proc.c18
-rw-r--r--fs/nfsd/nfs3xdr.c5
-rw-r--r--fs/nfsd/nfs4callback.c17
-rw-r--r--fs/nfsd/nfs4state.c8
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c7
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/util.c55
-rw-r--r--fs/pipe.c32
-rw-r--r--fs/pnode.c5
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/proc/base.c71
-rw-r--r--fs/proc/inode.c52
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/root.c236
-rw-r--r--fs/read_write.c10
-rw-r--r--fs/splice.c8
-rw-r--r--fs/stat.c12
-rw-r--r--fs/super.c344
-rw-r--r--fs/sysfs/mount.c75
-rw-r--r--fs/ubifs/ioctl.c8
-rw-r--r--fs/udf/inode.c4
-rw-r--r--fs/udf/truncate.c8
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c18
171 files changed, 7306 insertions, 3457 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6dec8d1..aaee1e6584e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,6 +40,9 @@
*/
#define P9_LOCK_TIMEOUT (30*HZ)
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
extern struct file_system_type v9fs_fs_type;
extern const struct address_space_operations v9fs_addr_operations;
extern const struct file_operations v9fs_file_operations;
@@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+ struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
}
int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ /*
+ * 32-bit need the lock, concurrent updates could break the
+ * sequences and make i_size_read() loop forever.
+ * 64-bit updates are atomic and can skip the locking.
+ */
+ if (sizeof(i_size) > sizeof(long))
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, i_size);
+ if (sizeof(i_size) > sizeof(long))
+ spin_unlock(&inode->i_lock);
+}
#endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a25efa782fcc..9a1125305d84 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
i_size = i_size_read(inode);
if (iocb->ki_pos > i_size) {
inode_add_bytes(inode, iocb->ki_pos - i_size);
- i_size_write(inode, iocb->ki_pos);
+ /*
+ * Need to serialize against i_size_write() in
+ * v9fs_stat2inode()
+ */
+ v9fs_i_size_write(inode, iocb->ki_pos);
}
return retval;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859d3af5..72b779bc0942 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb);
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+ v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
@@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
* @stat: Plan 9 metadata (mistat) structure
* @inode: inode to populate
* @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
- struct super_block *sb)
+ struct super_block *sb, unsigned int flags)
{
umode_t mode;
char ext[32];
@@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->length);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+ inode->i_blocks = (stat->length + 512 - 1) >> 9;
v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
}
@@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
{
int umode;
dev_t rdev;
- loff_t i_size;
struct p9_wstat *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_stat(fid);
@@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode(st, inode, inode->i_sb, flags);
out:
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c46999..a950a927a626 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode_dotl(st, inode);
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode_dotl(st, d_inode(dentry));
+ v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
generic_fillattr(d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags)
{
umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
}
if (stat->st_result_mask & P9_STATS_RDEV)
inode->i_rdev = new_decode_dev(stat->st_rdev);
- if (stat->st_result_mask & P9_STATS_SIZE)
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+ stat->st_result_mask & P9_STATS_SIZE)
+ v9fs_i_size_write(inode, stat->st_size);
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
@@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
{
- loff_t i_size;
struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode_dotl(st, inode, flags);
out:
kfree(st);
return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..d13d35cf69c0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
return ret;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
@@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root));
+ v9fs_stat2inode_dotl(st, d_inode(root), 0);
kfree(st);
} else {
struct p9_wstat *st = NULL;
@@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb);
+ v9fs_stat2inode(st, d_inode(root), sb, 0);
p9stat_free(st);
kfree(st);
diff --git a/fs/Kconfig b/fs/Kconfig
index 2557506051a3..3e6d3101f3ff 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -8,6 +8,13 @@ menu "File systems"
config DCACHE_WORD_ACCESS
bool
+config VALIDATE_FS_PARSER
+ bool "Validate filesystem parameter description"
+ default y
+ help
+ Enable this to perform validation of the parameter description for a
+ filesystem when it is registered.
+
if BLOCK
config FS_IOMAP
diff --git a/fs/Makefile b/fs/Makefile
index 7bff9abecfa4..427fec226fae 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o
+ fs_types.o fs_context.o fs_parser.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8871b9e8645f..bb1f244b2b3a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -36,15 +36,14 @@
struct pagevec;
struct afs_call;
-struct afs_mount_params {
- bool rwpath; /* T if the parent should be considered R/W */
+struct afs_fs_context {
bool force; /* T to force cell type */
bool autocell; /* T if set auto mount operation */
bool dyn_root; /* T if dynamic root */
+ bool no_cell; /* T if the source is "none" (for dynroot) */
afs_voltype_t type; /* type of volume requested */
- int volnamesz; /* size of volume name */
+ unsigned int volnamesz; /* size of volume name */
const char *volname; /* name of volume to mount */
- struct net *net_ns; /* Network namespace in effect */
struct afs_net *net; /* the AFS net namespace stuff */
struct afs_cell *cell; /* cell in which to find volume */
struct afs_volume *volume; /* volume record */
@@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume)
return volume;
}
-extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2e51c6994148..eecd8b699186 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -17,6 +17,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/gfp.h>
+#include <linux/fs_context.h>
#include "internal.h"
@@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
+static const char afs_root_volume[] = "root.cell";
+
/*
* no valid lookup procedure on this sort of dir
*/
@@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
}
/*
- * create a vfsmount to be automounted
+ * Set the parameters for the proposed superblock.
*/
-static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
{
- struct afs_super_info *as;
- struct vfsmount *mnt;
- struct afs_vnode *vnode;
- struct page *page;
- char *devname, *options;
- bool rwpath = false;
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt));
+ struct afs_cell *cell;
+ const char *p;
int ret;
- _enter("{%pd}", mntpt);
-
- BUG_ON(!d_inode(mntpt));
-
- ret = -ENOMEM;
- devname = (char *) get_zeroed_page(GFP_KERNEL);
- if (!devname)
- goto error_no_devname;
-
- options = (char *) get_zeroed_page(GFP_KERNEL);
- if (!options)
- goto error_no_options;
+ if (fc->net_ns != src_as->net_ns) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(src_as->net_ns);
+ }
- vnode = AFS_FS_I(d_inode(mntpt));
+ if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
+ }
+ if (ctx->cell) {
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = NULL;
+ }
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
/* if the directory is a pseudo directory, use the d_name */
- static const char afs_root_cell[] = ":root.cell.";
unsigned size = mntpt->d_name.len;
- ret = -ENOENT;
- if (size < 2 || size > AFS_MAXCELLNAME)
- goto error_no_page;
+ if (size < 2)
+ return -ENOENT;
+ p = mntpt->d_name.name;
if (mntpt->d_name.name[0] == '.') {
- devname[0] = '%';
- memcpy(devname + 1, mntpt->d_name.name + 1, size - 1);
- memcpy(devname + size, afs_root_cell,
- sizeof(afs_root_cell));
- rwpath = true;
- } else {
- devname[0] = '#';
- memcpy(devname + 1, mntpt->d_name.name, size);
- memcpy(devname + size + 1, afs_root_cell,
- sizeof(afs_root_cell));
+ size--;
+ p++;
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
+ if (size > AFS_MAXCELLNAME)
+ return -ENAMETOOLONG;
+
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ if (IS_ERR(cell)) {
+ pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
+ return PTR_ERR(cell);
+ }
+ ctx->cell = cell;
+
+ ctx->volname = afs_root_volume;
+ ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else {
/* read the contents of the AFS special symlink */
+ struct page *page;
loff_t size = i_size_read(d_inode(mntpt));
char *buf;
- ret = -EINVAL;
+ if (src_as->cell)
+ ctx->cell = afs_get_cell(src_as->cell);
+
if (size > PAGE_SIZE - 1)
- goto error_no_page;
+ return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto error_no_page;
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
if (PageError(page)) {
ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- goto error;
+ put_page(page);
+ return ret;
}
- buf = kmap_atomic(page);
- memcpy(devname, buf, size);
- kunmap_atomic(buf);
+ buf = kmap(page);
+ ret = vfs_parse_fs_string(fc, "source", buf, size);
+ kunmap(page);
put_page(page);
- page = NULL;
+ if (ret < 0)
+ return ret;
}
- /* work out what options we want */
- as = AFS_FS_S(mntpt->d_sb);
- if (as->cell) {
- memcpy(options, "cell=", 5);
- strcpy(options + 5, as->cell->name);
- if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath)
- strcat(options, ",rwpath");
- }
+ return 0;
+}
- /* try and do the mount */
- _debug("--- attempting mount %s -o %s ---", devname, options);
- mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
- _debug("--- mount result %p ---", mnt);
+/*
+ * create a vfsmount to be automounted
+ */
+static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret;
- free_page((unsigned long) devname);
- free_page((unsigned long) options);
- _leave(" = %p", mnt);
- return mnt;
+ BUG_ON(!d_inode(mntpt));
-error:
- put_page(page);
-error_no_page:
- free_page((unsigned long) options);
-error_no_options:
- free_page((unsigned long) devname);
-error_no_devname:
- _leave(" = %d", ret);
- return ERR_PTR(ret);
+ fc = fs_context_for_submount(&afs_fs_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = afs_mntpt_set_params(fc, mntpt);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
}
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index dcd07fe99871..5adf012b8e27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,6 +1,6 @@
/* AFS superblock handling
*
- * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved.
*
* This software may be freely redistributed under the terms of the
* GNU General Public License.
@@ -21,7 +21,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/statfs.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
@@ -30,21 +30,22 @@
#include "internal.h"
static void afs_i_init_once(void *foo);
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data);
static void afs_kill_super(struct super_block *sb);
static struct inode *afs_alloc_inode(struct super_block *sb);
static void afs_destroy_inode(struct inode *inode);
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int afs_show_devname(struct seq_file *m, struct dentry *root);
static int afs_show_options(struct seq_file *m, struct dentry *root);
+static int afs_init_fs_context(struct fs_context *fc);
+static const struct fs_parameter_description afs_fs_parameters;
struct file_system_type afs_fs_type = {
- .owner = THIS_MODULE,
- .name = "afs",
- .mount = afs_mount,
- .kill_sb = afs_kill_super,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "afs",
+ .init_fs_context = afs_init_fs_context,
+ .parameters = &afs_fs_parameters,
+ .kill_sb = afs_kill_super,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("afs");
@@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = {
static struct kmem_cache *afs_inode_cachep;
static atomic_t afs_count_active_inodes;
-enum {
- afs_no_opt,
- afs_opt_cell,
- afs_opt_dyn,
- afs_opt_rwpath,
- afs_opt_vol,
- afs_opt_autocell,
+enum afs_param {
+ Opt_autocell,
+ Opt_dyn,
+ Opt_source,
};
-static const match_table_t afs_options_list = {
- { afs_opt_cell, "cell=%s" },
- { afs_opt_dyn, "dyn" },
- { afs_opt_rwpath, "rwpath" },
- { afs_opt_vol, "vol=%s" },
- { afs_opt_autocell, "autocell" },
- { afs_no_opt, NULL },
+static const struct fs_parameter_spec afs_param_specs[] = {
+ fsparam_flag ("autocell", Opt_autocell),
+ fsparam_flag ("dyn", Opt_dyn),
+ fsparam_string("source", Opt_source),
+ {}
+};
+
+static const struct fs_parameter_description afs_fs_parameters = {
+ .name = "kAFS",
+ .specs = afs_param_specs,
};
/*
@@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
}
/*
- * parse the mount options
- * - this function has been shamelessly adapted from the ext3 fs which
- * shamelessly adapted it from the msdos fs
- */
-static int afs_parse_options(struct afs_mount_params *params,
- char *options, const char **devname)
-{
- struct afs_cell *cell;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int token;
-
- _enter("%s", options);
-
- options[PAGE_SIZE - 1] = 0;
-
- while ((p = strsep(&options, ","))) {
- if (!*p)
- continue;
-
- token = match_token(p, afs_options_list, args);
- switch (token) {
- case afs_opt_cell:
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(params->net,
- args[0].from,
- args[0].to - args[0].from);
- rcu_read_unlock();
- if (IS_ERR(cell))
- return PTR_ERR(cell);
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
- break;
-
- case afs_opt_rwpath:
- params->rwpath = true;
- break;
-
- case afs_opt_vol:
- *devname = args[0].from;
- break;
-
- case afs_opt_autocell:
- params->autocell = true;
- break;
-
- case afs_opt_dyn:
- params->dyn_root = true;
- break;
-
- default:
- printk(KERN_ERR "kAFS:"
- " Unknown or invalid mount option: '%s'\n", p);
- return -EINVAL;
- }
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
- * parse a device name to get cell name, volume name, volume type and R/W
- * selector
- * - this can be one of the following:
+ * Parse the source name to get cell name, volume name, volume type and R/W
+ * selector.
+ *
+ * This can be one of the following:
* "%[cell:]volume[.]" R/W volume
- * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
- * or R/W (rwpath=1) volume
+ * "#[cell:]volume[.]" R/O or R/W volume (R/O parent),
+ * or R/W (R/W parent) volume
* "%[cell:]volume.readonly" R/O volume
* "#[cell:]volume.readonly" R/O volume
* "%[cell:]volume.backup" Backup volume
* "#[cell:]volume.backup" Backup volume
*/
-static int afs_parse_device_name(struct afs_mount_params *params,
- const char *name)
+static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_cell *cell;
- const char *cellname, *suffix;
+ const char *cellname, *suffix, *name = param->string;
int cellnamesz;
_enter(",%s", name);
@@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params,
}
if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+ /* To use dynroot, we don't want to have to provide a source */
+ if (strcmp(name, "none") == 0) {
+ ctx->no_cell = true;
+ return 0;
+ }
printk(KERN_ERR "kAFS: unparsable volume name\n");
return -EINVAL;
}
/* determine the type of volume we're looking for */
- params->type = AFSVL_ROVOL;
- params->force = false;
- if (params->rwpath || name[0] == '%') {
- params->type = AFSVL_RWVOL;
- params->force = true;
+ if (name[0] == '%') {
+ ctx->type = AFSVL_RWVOL;
+ ctx->force = true;
}
name++;
/* split the cell name out if there is one */
- params->volname = strchr(name, ':');
- if (params->volname) {
+ ctx->volname = strchr(name, ':');
+ if (ctx->volname) {
cellname = name;
- cellnamesz = params->volname - name;
- params->volname++;
+ cellnamesz = ctx->volname - name;
+ ctx->volname++;
} else {
- params->volname = name;
+ ctx->volname = name;
cellname = NULL;
cellnamesz = 0;
}
/* the volume type is further affected by a possible suffix */
- suffix = strrchr(params->volname, '.');
+ suffix = strrchr(ctx->volname, '.');
if (suffix) {
if (strcmp(suffix, ".readonly") == 0) {
- params->type = AFSVL_ROVOL;
- params->force = true;
+ ctx->type = AFSVL_ROVOL;
+ ctx->force = true;
} else if (strcmp(suffix, ".backup") == 0) {
- params->type = AFSVL_BACKVOL;
- params->force = true;
+ ctx->type = AFSVL_BACKVOL;
+ ctx->force = true;
} else if (suffix[1] == 0) {
} else {
suffix = NULL;
}
}
- params->volnamesz = suffix ?
- suffix - params->volname : strlen(params->volname);
+ ctx->volnamesz = suffix ?
+ suffix - ctx->volname : strlen(ctx->volname);
_debug("cell %*.*s [%p]",
- cellnamesz, cellnamesz, cellname ?: "", params->cell);
+ cellnamesz, cellnamesz, cellname ?: "", ctx->cell);
/* lookup the cell record */
- if (cellname || !params->cell) {
- cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+ if (cellname) {
+ cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
NULL, false);
if (IS_ERR(cell)) {
- printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+ pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(params->net, params->cell);
- params->cell = cell;
+ afs_put_cell(ctx->net, ctx->cell);
+ ctx->cell = cell;
}
_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
- params->cell->name, params->cell,
- params->volnamesz, params->volnamesz, params->volname,
- suffix ?: "-", params->type, params->force ? " FORCE" : "");
+ ctx->cell->name, ctx->cell,
+ ctx->volnamesz, ctx->volnamesz, ctx->volname,
+ suffix ?: "-", ctx->type, ctx->force ? " FORCE" : "");
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct afs_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, &afs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_source:
+ return afs_parse_source(fc, param);
+
+ case Opt_autocell:
+ ctx->autocell = true;
+ break;
+
+ case Opt_dyn:
+ ctx->dyn_root = true;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Validate the options, get the cell key and look up the volume.
+ */
+static int afs_validate_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+ struct afs_volume *volume;
+ struct key *key;
+
+ if (!ctx->dyn_root) {
+ if (ctx->no_cell) {
+ pr_warn("kAFS: Can only specify source 'none' with -o dyn\n");
+ return -EINVAL;
+ }
+
+ if (!ctx->cell) {
+ pr_warn("kAFS: No cell specified\n");
+ return -EDESTADDRREQ;
+ }
+
+ /* We try to do the mount securely. */
+ key = afs_request_key(ctx->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ ctx->key = key;
+
+ if (ctx->volume) {
+ afs_put_volume(ctx->cell, ctx->volume);
+ ctx->volume = NULL;
+ }
+
+ volume = afs_create_volume(ctx);
+ if (IS_ERR(volume))
+ return PTR_ERR(volume);
+
+ ctx->volume = volume;
+ }
return 0;
}
@@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params,
/*
* check a superblock to see if it's the one we're looking for
*/
-static int afs_test_super(struct super_block *sb, void *data)
+static int afs_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->volume &&
- as->volume->vid == as1->volume->vid &&
+ as->volume->vid == ctx->volume->vid &&
!as->dyn_root);
}
-static int afs_dynroot_test_super(struct super_block *sb, void *data)
+static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as1 = data;
struct afs_super_info *as = AFS_FS_S(sb);
- return (as->net_ns == as1->net_ns &&
+ return (as->net_ns == fc->net_ns &&
as->dyn_root);
}
-static int afs_set_super(struct super_block *sb, void *data)
+static int afs_set_super(struct super_block *sb, struct fs_context *fc)
{
- struct afs_super_info *as = data;
-
- sb->s_fs_info = as;
return set_anon_super(sb, NULL);
}
/*
* fill in the superblock
*/
-static int afs_fill_super(struct super_block *sb,
- struct afs_mount_params *params)
+static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
{
struct afs_super_info *as = AFS_FS_S(sb);
struct afs_fid fid;
@@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb,
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb,
fid.vnode = 1;
fid.vnode_hi = 0;
fid.unique = 1;
- inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
+ inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL);
}
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (params->autocell || params->dyn_root)
+ if (ctx->autocell || as->dyn_root)
set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
ret = -ENOMEM;
@@ -443,17 +458,20 @@ error:
return ret;
}
-static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
{
+ struct afs_fs_context *ctx = fc->fs_private;
struct afs_super_info *as;
as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
if (as) {
- as->net_ns = get_net(params->net_ns);
- if (params->dyn_root)
+ as->net_ns = get_net(fc->net_ns);
+ if (ctx->dyn_root) {
as->dyn_root = true;
- else
- as->cell = afs_get_cell(params->cell);
+ } else {
+ as->cell = afs_get_cell(ctx->cell);
+ as->volume = __afs_get_volume(ctx->volume);
+ }
}
return as;
}
@@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb)
if (as->dyn_root)
afs_dynroot_depopulate(sb);
-
+
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb)
}
/*
- * get an AFS superblock
+ * Get an AFS superblock and root directory.
*/
-static struct dentry *afs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *options)
+static int afs_get_tree(struct fs_context *fc)
{
- struct afs_mount_params params;
+ struct afs_fs_context *ctx = fc->fs_private;
struct super_block *sb;
- struct afs_volume *candidate;
- struct key *key;
struct afs_super_info *as;
int ret;
- _enter(",,%s,%p", dev_name, options);
-
- memset(&params, 0, sizeof(params));
-
- ret = -EINVAL;
- if (current->nsproxy->net_ns != &init_net)
+ ret = afs_validate_fc(fc);
+ if (ret)
goto error;
- params.net_ns = current->nsproxy->net_ns;
- params.net = afs_net(params.net_ns);
-
- /* parse the options and device name */
- if (options) {
- ret = afs_parse_options(&params, options, &dev_name);
- if (ret < 0)
- goto error;
- }
-
- if (!params.dyn_root) {
- ret = afs_parse_device_name(&params, dev_name);
- if (ret < 0)
- goto error;
- /* try and do the mount securely */
- key = afs_request_key(params.cell);
- if (IS_ERR(key)) {
- _leave(" = %ld [key]", PTR_ERR(key));
- ret = PTR_ERR(key);
- goto error;
- }
- params.key = key;
- }
+ _enter("");
/* allocate a superblock info record */
ret = -ENOMEM;
- as = afs_alloc_sbi(&params);
+ as = afs_alloc_sbi(fc);
if (!as)
- goto error_key;
-
- if (!params.dyn_root) {
- /* Assume we're going to need a volume record; at the very
- * least we can use it to update the volume record if we have
- * one already. This checks that the volume exists within the
- * cell.
- */
- candidate = afs_create_volume(&params);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_as;
- }
-
- as->volume = candidate;
- }
+ goto error;
+ fc->s_fs_info = as;
/* allocate a deviceless superblock */
- sb = sget(fs_type,
- as->dyn_root ? afs_dynroot_test_super : afs_test_super,
- afs_set_super, flags, as);
+ sb = sget_fc(fc,
+ as->dyn_root ? afs_dynroot_test_super : afs_test_super,
+ afs_set_super);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- goto error_as;
+ goto error;
}
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- ret = afs_fill_super(sb, &params);
+ ret = afs_fill_super(sb, ctx);
if (ret < 0)
goto error_sb;
- as = NULL;
sb->s_flags |= SB_ACTIVE;
} else {
_debug("reuse");
ASSERTCMP(sb->s_flags, &, SB_ACTIVE);
- afs_destroy_sbi(as);
- as = NULL;
}
- afs_put_cell(params.net, params.cell);
- key_put(params.key);
+ fc->root = dget(sb->s_root);
_leave(" = 0 [%p]", sb);
- return dget(sb->s_root);
+ return 0;
error_sb:
deactivate_locked_super(sb);
- goto error_key;
-error_as:
- afs_destroy_sbi(as);
-error_key:
- key_put(params.key);
error:
- afs_put_cell(params.net, params.cell);
_leave(" = %d", ret);
- return ERR_PTR(ret);
+ return ret;
+}
+
+static void afs_free_fc(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx = fc->fs_private;
+
+ afs_destroy_sbi(fc->s_fs_info);
+ afs_put_volume(ctx->cell, ctx->volume);
+ afs_put_cell(ctx->net, ctx->cell);
+ key_put(ctx->key);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations afs_context_ops = {
+ .free = afs_free_fc,
+ .parse_param = afs_parse_param,
+ .get_tree = afs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int afs_init_fs_context(struct fs_context *fc)
+{
+ struct afs_fs_context *ctx;
+ struct afs_cell *cell;
+
+ ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->type = AFSVL_ROVOL;
+ ctx->net = afs_net(fc->net_ns);
+
+ /* Default to the workstation cell. */
+ rcu_read_lock();
+ cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
+ rcu_read_unlock();
+ if (IS_ERR(cell))
+ cell = NULL;
+ ctx->cell = cell;
+
+ fc->fs_private = ctx;
+ fc->ops = &afs_context_ops;
+ return 0;
}
/*
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 00975ed3640f..f6eba2def0a1 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
/*
* Allocate a volume record and load it up from a vldb record.
*/
-static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
unsigned long type_mask)
{
@@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
* - Rule 3: If parent volume is R/W, then only mount R/W volume unless
* explicitly told otherwise
*/
-struct afs_volume *afs_create_volume(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_fs_context *params)
{
struct afs_vldb_entry *vldb;
struct afs_volume *volume;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 129d26226e70..b3642367a595 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1210,6 +1210,8 @@ enum {
* Set for the subvolume tree owning the reloc tree.
*/
BTRFS_ROOT_DEAD_RELOC_TREE,
+ /* Mark dead root stored on device whose cleanup needs to be resumed */
+ BTRFS_ROOT_DEAD_TREE,
};
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f0cdb53f3e2d..6fe9197f6ee4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2958,7 +2958,7 @@ int open_ctree(struct super_block *sb,
sb->s_bdi->congested_fn = btrfs_congested_fn;
sb->s_bdi->congested_data = fs_info;
sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 994f0cc41799..1d49694e6ae3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8764,6 +8764,8 @@ struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
+ struct btrfs_key drop_progress;
+ int drop_level;
int stage;
int level;
int shared_level;
@@ -8771,6 +8773,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int restarted;
};
#define DROP_REFERENCE 1
@@ -8934,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 parent,
+ int level)
+{
+ struct btrfs_path *path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
+ root->fs_info->nodesize, parent,
+ root->root_key.objectid, level, 0);
+ btrfs_free_path(path);
+ if (ret == -ENOENT)
+ return 0;
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -9088,6 +9118,23 @@ skip:
}
/*
+ * If we had a drop_progress we need to verify the refs are set
+ * as expected. If we find our ref then we know that from here
+ * on out everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, bytenr, parent,
+ level - 1);
+ if (ret < 0)
+ goto out_unlock;
+ if (ret == 0)
+ goto no_delete;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
* Reloc tree doesn't contribute to qgroup numbers, and we have
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
@@ -9102,13 +9149,23 @@ skip:
ret);
}
}
+
+ /*
+ * We need to update the next key in our walk control so we can
+ * update the drop_progress key accordingly. We don't care if
+ * find_next_key doesn't find a key because that means we're at
+ * the end and are going to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
parent, root->root_key.objectid,
level - 1, 0);
if (ret)
goto out_unlock;
}
-
+no_delete:
*lookup_info = 1;
ret = 1;
@@ -9425,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
wc->level = level;
wc->shared_level = -1;
wc->stage = DROP_REFERENCE;
@@ -9452,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (wc->stage == DROP_REFERENCE) {
- level = wc->level;
- btrfs_node_key(path->nodes[level],
- &root_item->drop_progress,
- path->slots[level]);
- root_item->drop_level = level;
- }
+ wc->drop_level = wc->level;
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
+ &wc->drop_progress,
+ path->slots[wc->drop_level]);
+ }
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
+ &wc->drop_progress);
+ root_item->drop_level = wc->drop_level;
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ab705183d749..ca8b8e785cf3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2995,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->orig_start)
+ *prev_em_start != em->start)
force_bio_submit = true;
if (prev_em_start)
- *prev_em_start = em->orig_start;
+ *prev_em_start = em->start;
free_extent_map(em);
em = NULL;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 494f0f10d70e..ec2d8919e7fb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3207,21 +3207,6 @@ out:
return ret;
}
-static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
-{
- inode_unlock(inode1);
- inode_unlock(inode2);
-}
-
-static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
-{
- if (inode1 < inode2)
- swap(inode1, inode2);
-
- inode_lock_nested(inode1, I_MUTEX_PARENT);
- inode_lock_nested(inode2, I_MUTEX_CHILD);
-}
-
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
@@ -3956,7 +3941,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_lock(inode_in);
else
- btrfs_double_inode_lock(inode_in, inode_out);
+ lock_two_nondirectories(inode_in, inode_out);
/* don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
@@ -4013,7 +3998,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (same_inode)
inode_unlock(inode_in);
else
- btrfs_double_inode_unlock(inode_in, inode_out);
+ unlock_two_nondirectories(inode_in, inode_out);
return ret;
}
@@ -4043,7 +4028,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (same_inode)
inode_unlock(src_inode);
else
- btrfs_double_inode_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(src_inode, dst_inode);
return ret < 0 ? ret : len;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c1cd5558a646..eb680b715dd6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
if (fs_info->quota_root)
goto out;
+ fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
/*
* initially create the quota tree
*/
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d2b957ca3a3..893d12fbfda0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
if (root) {
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state));
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
continue;
}
@@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
break;
}
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
+ }
}
btrfs_free_path(path);
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e418a3aeb11..6b9e29d050f3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -195,8 +195,7 @@ static void zstd_cleanup_workspace_manager(void)
struct workspace *workspace;
int i;
- del_timer(&wsm.timer);
-
+ spin_lock(&wsm.lock);
for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
while (!list_empty(&wsm.idle_ws[i])) {
workspace = container_of(wsm.idle_ws[i].next,
@@ -206,6 +205,9 @@ static void zstd_cleanup_workspace_manager(void)
wsm.ops->free_workspace(&workspace->list);
}
}
+ spin_unlock(&wsm.lock);
+
+ del_timer_sync(&wsm.timer);
}
/*
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bba28a5034ba..36a8dc699448 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt)
{
spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_min_count += delta;
- BUG_ON(mdsc->caps_min_count < 0);
+ mdsc->caps_min_count = fsopt->max_readdir;
+ if (mdsc->caps_min_count < 1024)
+ mdsc->caps_min_count = 1024;
+ mdsc->caps_use_max = fsopt->caps_max;
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_max < mdsc->caps_min_count)
+ mdsc->caps_use_max = mdsc->caps_min_count;
spin_unlock(&mdsc->caps_list_lock);
}
@@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
if (!err) {
BUG_ON(have + alloc != need);
ctx->count = need;
+ ctx->used = 0;
}
spin_lock(&mdsc->caps_list_lock);
@@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+ struct ceph_cap_reservation *ctx)
{
+ bool reclaim = false;
+ if (!ctx->count)
+ return;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
+
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ reclaim = true;
spin_unlock(&mdsc->caps_list_lock);
+
+ if (reclaim)
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
}
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
@@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
BUG_ON(list_empty(&mdsc->caps_list));
ctx->count--;
+ ctx->used++;
mdsc->caps_reserve_count--;
mdsc->caps_use_count++;
@@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
+ opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
+ opt->caps_wanted_delay_max * HZ);
dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}
@@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode,
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
+ spin_lock(&session->s_cap_lock);
+ list_move_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
if (cap->cap_gen < session->s_cap_gen)
cap->issued = cap->implemented = CEPH_CAP_PIN;
@@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
(!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
cap->queue_release = 1;
if (removed) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
removed = 0;
}
} else {
@@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
*/
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
@@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
__kick_flushing_caps(mdsc, session, ci,
oldest_flush_tid);
} else {
@@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap->seq = seq;
cap->issue_seq = seq;
spin_lock(&session->s_cap_lock);
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
+ __ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto flush_cap_releases;
+ goto done;
}
/* these will work even if we don't have a cap yet */
@@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_cap_op_name(op));
}
- goto done;
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+ ceph_put_string(extra_info.pool_ns);
+ return;
flush_cap_releases:
/*
@@ -3963,14 +3993,8 @@ flush_cap_releases:
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- iput(inode);
- ceph_put_string(extra_info.pool_ns);
- return;
+ ceph_flush_cap_releases(mdsc, session);
+ goto done;
bad:
pr_err("ceph_handle_caps: corrupt message\n");
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index abdf98deeec4..98365e74cb4a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p)
return 0;
}
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
-
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%pd\n",
- di, dentry, dentry);
- }
- spin_unlock(&mdsc->dentry_lru_lock);
-
- return 0;
-}
-
static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
@@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
@@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
}
int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
if (!fsc->debugfs_caps)
goto out;
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
return 0;
out:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..a8f429882249 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -29,6 +29,9 @@
const struct dentry_operations ceph_dentry_ops;
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
/*
* Initialize ceph dentry state.
*/
@@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry)
di->lease_session = NULL;
di->time = jiffies;
dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
+ INIT_LIST_HEAD(&di->lease_list);
return 0;
}
@@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
goto out;
}
if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ __ceph_dentry_dir_lease_touch(di);
emit_dentry = true;
}
spin_unlock(&dentry->d_lock);
@@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
}
/*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at front of the list will expire first. (Assume all leases have
+ * similar duration)
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+
+ di->flags |= CEPH_DENTRY_LEASE_LIST;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+ struct ceph_dentry_info *di)
+{
+ di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+ di->lease_gen = 0;
+ di->time = jiffies;
+ list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ di, dn, dn, di->offset);
+
+ if (!list_empty(&di->lease_list)) {
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ /* don't remove dentry from dentry lease list
+ * if its lease is valid */
+ if (__dentry_lease_is_valid(di))
+ return;
+ } else {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+ }
+
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+ return;
+ }
+
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ __dentry_dir_lease_touch(mdsc, di),
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_client *mdsc;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+ return;
+ if (list_empty(&di->lease_list))
+ return;
+
+ mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_del_init(&di->lease_list);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+ KEEP = 0,
+ DELETE = 1,
+ TOUCH = 2,
+ STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+ bool dir_lease;
+ bool expire_dir_lease;
+ unsigned long nr_to_scan;
+ unsigned long dir_lease_ttl;
+};
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+ struct ceph_lease_walk_control *lwc,
+ int (*check)(struct dentry*, void*))
+{
+ struct ceph_dentry_info *di, *tmp;
+ struct dentry *dentry, *last = NULL;
+ struct list_head* list;
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ int ret = 0;
+
+ list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_for_each_entry_safe(di, tmp, list, lease_list) {
+ if (!lwc->nr_to_scan)
+ break;
+ --lwc->nr_to_scan;
+
+ dentry = di->dentry;
+ if (last == dentry)
+ break;
+
+ if (!spin_trylock(&dentry->d_lock))
+ continue;
+
+ if (dentry->d_lockref.count < 0) {
+ list_del_init(&di->lease_list);
+ goto next;
+ }
+
+ ret = check(dentry, lwc);
+ if (ret & TOUCH) {
+ /* move it into tail of dir lease list */
+ __dentry_dir_lease_touch(mdsc, di);
+ if (!last)
+ last = dentry;
+ }
+ if (ret & DELETE) {
+ /* stale lease */
+ di->flags &= ~CEPH_DENTRY_REFERENCED;
+ if (dentry->d_lockref.count > 0) {
+ /* update_dentry_lease() will re-add
+ * it to lease list, or
+ * ceph_d_delete() will return 1 when
+ * last reference is dropped */
+ list_del_init(&di->lease_list);
+ } else {
+ di->flags |= CEPH_DENTRY_SHRINK_LIST;
+ list_move_tail(&di->lease_list, &dispose);
+ dget_dlock(dentry);
+ }
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ if (ret & STOP)
+ break;
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+
+ while (!list_empty(&dispose)) {
+ di = list_first_entry(&dispose, struct ceph_dentry_info,
+ lease_list);
+ dentry = di->dentry;
+ spin_lock(&dentry->d_lock);
+
+ list_del_init(&di->lease_list);
+ di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+ if (di->flags & CEPH_DENTRY_REFERENCED) {
+ spin_lock(&mdsc->dentry_list_lock);
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ list_add_tail(&di->lease_list,
+ &mdsc->dentry_leases);
+ } else {
+ __dentry_dir_lease_touch(mdsc, di);
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+ } else {
+ freed++;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ /* ceph_d_delete() does the trick */
+ dput(dentry);
+ }
+ return freed;
+}
+
+static int __dentry_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int ret;
+
+ if (__dentry_lease_is_valid(di))
+ return STOP;
+ ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0)
+ return TOUCH;
+ return DELETE;
+}
+
+static int __dir_lease_check(struct dentry *dentry, void *arg)
+{
+ struct ceph_lease_walk_control *lwc = arg;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ int ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0) {
+ if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+ return STOP;
+ /* Move dentry to tail of dir lease list if we don't want
+ * to delete it. So dentries in the list are checked in a
+ * round robin manner */
+ if (!lwc->expire_dir_lease)
+ return TOUCH;
+ if (dentry->d_lockref.count > 0 ||
+ (di->flags & CEPH_DENTRY_REFERENCED))
+ return TOUCH;
+ /* invalidate dir lease */
+ di->lease_shared_gen = 0;
+ }
+ return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+ struct ceph_lease_walk_control lwc;
+ unsigned long count;
+ unsigned long freed;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ count = mdsc->caps_use_count - mdsc->caps_use_max;
+ else
+ count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ lwc.dir_lease = false;
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+ freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ if (!lwc.nr_to_scan) /* more invalid leases */
+ return -EAGAIN;
+
+ if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+ lwc.dir_lease = true;
+ lwc.expire_dir_lease = freed < count;
+ lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+ freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ if (!lwc.nr_to_scan) /* more to check */
+ return -EAGAIN;
+
+ return freed > 0 ? 1 : 0;
+}
+
+/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
- ceph_dentry(dentry)->time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ __dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_session *session;
+
+ if (!di->lease_gen)
+ return false;
+
+ session = di->lease_session;
+ if (session) {
+ u32 gen;
+ unsigned long ttl;
+
+ spin_lock(&session->s_gen_ttl_lock);
+ gen = session->s_cap_gen;
+ ttl = session->s_cap_ttl;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, ttl) &&
+ time_before(jiffies, di->time))
+ return true;
+ }
+ di->lease_gen = 0;
+ return false;
+}
+
static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
struct inode *dir)
{
struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
struct ceph_mds_session *session = NULL;
u32 seq = 0;
+ int valid = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di && di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
+ if (di && __dentry_lease_is_valid(di)) {
+ valid = 1;
- if (di->lease_gen == gen &&
- time_before(jiffies, di->time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /*
- * We should renew. If we're in RCU walk mode
- * though, we can't do that so just return
- * -ECHILD.
- */
- if (flags & LOOKUP_RCU) {
- valid = -ECHILD;
- } else {
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
}
}
}
@@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
}
/*
+ * Called under dentry->d_lock.
+ */
+static int __dir_lease_try_check(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *dir;
+ struct ceph_inode_info *ci;
+ int valid = 0;
+
+ if (!di->lease_shared_gen)
+ return 0;
+ if (IS_ROOT(dentry))
+ return 0;
+
+ dir = d_inode(dentry->d_parent);
+ ci = ceph_inode(dir);
+
+ if (spin_trylock(&ci->i_ceph_lock)) {
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+ valid = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ valid = -EBUSY;
+ }
+
+ if (!valid)
+ di->lease_shared_gen = 0;
+ return valid;
+}
+
+/*
* Check if directory-wide content lease/cap is valid.
*/
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
@@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
spin_unlock(&ci->i_ceph_lock);
+ if (valid)
+ __ceph_dentry_dir_lease_touch(di);
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
dir, (unsigned)atomic_read(&ci->i_shared_gen),
dentry, (unsigned)di->lease_shared_gen, valid);
@@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid) {
- ceph_dentry_lru_touch(dentry);
- } else {
+ if (!valid)
ceph_dir_clear_complete(dir);
- }
if (!(flags & LOOKUP_RCU))
dput(parent);
@@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
/*
+ * Delete unused dentry that doesn't have valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ /* won't release caps */
+ if (d_really_is_negative(dentry))
+ return 0;
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+ return 0;
+ /* vaild lease? */
+ di = ceph_dentry(dentry);
+ if (di) {
+ if (__dentry_lease_is_valid(di))
+ return 0;
+ if (__dir_lease_try_check(dentry))
+ return 0;
+ }
+ return 1;
+}
+
+/*
* Release our ceph_dentry_info.
*/
static void ceph_d_release(struct dentry *dentry)
@@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry)
struct ceph_dentry_info *di = ceph_dentry(dentry);
dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
spin_lock(&dentry->d_lock);
+ __dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
@@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return size - left;
}
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
- di->offset);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-void ceph_dentry_lru_del(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
/*
* Return name hash for a given dentry. This is dependent on
@@ -1531,6 +1827,7 @@ const struct inode_operations ceph_snapdir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
+ .d_delete = ceph_d_delete,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
.d_init = ceph_d_init,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 189df668b6a0..9f53c3d99304 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ off, off + len - 1);
if (ret < 0)
return ret;
@@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
@@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
file, pos, (unsigned)count, snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
- (pos + count) >> PAGE_SHIFT);
+ (pos + count - 1) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9d1f34d46627..e3346628efe2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
- atomic_set(&ci->i_shared_gen, 0);
+ atomic_set(&ci->i_shared_gen, 1);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
@@ -537,7 +537,7 @@ void ceph_destroy_inode(struct inode *inode)
ceph_fscache_unregister_inode_cookie(ci);
- ceph_queue_caps_release(inode);
+ __ceph_remove_caps(inode);
if (__ceph_has_any_quota(ci))
ceph_adjust_quota_realms_count(inode, false);
@@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ceph_inode_to_client(inode)->mdsc;
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ dout(" dropping residual ref to snap realm %p\n",
+ realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ } else {
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+ ci->i_snap_realm = NULL;
+ }
}
kfree(ci->i_symlink);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
iinfo->pool_ns_len);
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rbytes = le64_to_cpu(info->rbytes);
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ci->i_dir_pin = iinfo->dir_pin;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
+ inode->i_blkbits = PAGE_SHIFT;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
inode->i_op = &ceph_file_iops;
break;
@@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-
- if (duration == 0)
+ if (duration == 0) {
+ __ceph_dentry_dir_lease_touch(di);
goto out_unlock;
+ }
if (di->lease_gen == session->s_cap_gen &&
time_before(ttl, di->time))
@@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_session = NULL;
}
- ceph_dentry_lru_touch(dentry);
-
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
di->lease_gen = session->s_cap_gen;
@@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
di->time = ttl;
+
+ __ceph_dentry_lease_touch(di);
out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
@@ -2259,10 +2270,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
+ if (ceph_snap(inode) == CEPH_NOSNAP)
+ stat->dev = inode->i_sb->s_dev;
else
- stat->dev = 0;
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..21c33ed048ed 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -20,6 +20,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
/*
* A cluster of MDS (metadata server) daemons is responsible for
* managing the file system namespace (the directory hierarchy and
@@ -46,13 +48,17 @@
*/
struct ceph_reconnect_state {
- int nr_caps;
+ struct ceph_mds_session *session;
+ int nr_caps, nr_realms;
struct ceph_pagelist *pagelist;
unsigned msg_version;
+ bool allow_multi;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
@@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops;
* mds reply parsing
*/
+static int parse_reply_info_quota(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only
+ * understand encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse individual inode info
*/
@@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
u64 features)
{
- int err = -EIO;
+ int err = 0;
+ u8 struct_v = 0;
+ if (features == (u64)-1) {
+ u32 struct_len;
+ u8 struct_compat;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+
+ ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
info->in = *p;
*p += sizeof(struct ceph_mds_reply_inode) +
sizeof(*info->in->fragtree.splits) *
@@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end,
info->xattr_data = *p;
*p += info->xattr_len;
- if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ if (features == (u64)-1) {
+ /* inline data */
ceph_decode_64_safe(p, end, info->inline_version, bad);
ceph_decode_32_safe(p, end, info->inline_len, bad);
ceph_decode_need(p, end, info->inline_len, bad);
info->inline_data = *p;
*p += info->inline_len;
- } else
- info->inline_version = CEPH_INLINE_NONE;
+ /* quota */
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ /* pool namespace */
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ /* btime, change_attr */
+ {
+ struct ceph_timespec btime;
+ u64 change_attr;
+ ceph_decode_need(p, end, sizeof(btime), bad);
+ ceph_decode_copy(p, &btime, sizeof(btime));
+ ceph_decode_64_safe(p, end, change_attr, bad);
+ }
+
+ /* dir pin */
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, info->dir_pin, bad);
+ } else {
+ info->dir_pin = -ENODATA;
+ }
+
+ *p = end;
+ } else {
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
- if (features & CEPH_FEATURE_MDS_QUOTA) {
+ info->dir_pin = -ENODATA;
+ }
+ return 0;
+bad:
+ err = -EIO;
+out_bad:
+ return err;
+}
+
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_dirfrag **dirfrag,
+ u64 features)
+{
+ if (features == (u64)-1) {
u8 struct_v, struct_compat;
u32 struct_len;
-
- /*
- * both struct_v and struct_compat are expected to be >= 1
- */
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
- if (!struct_v || !struct_compat)
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
goto bad;
ceph_decode_32_safe(p, end, struct_len, bad);
ceph_decode_need(p, end, struct_len, bad);
- ceph_decode_64_safe(p, end, info->max_bytes, bad);
- ceph_decode_64_safe(p, end, info->max_files, bad);
- } else {
- info->max_bytes = 0;
- info->max_files = 0;
+ end = *p + struct_len;
}
- info->pool_ns_len = 0;
- info->pool_ns_data = NULL;
- if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
- ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- if (info->pool_ns_len > 0) {
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- info->pool_ns_data = *p;
- *p += info->pool_ns_len;
- }
+ ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+ *dirfrag = *p;
+ *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+ if (features == (u64)-1)
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+ struct ceph_mds_reply_lease **lease,
+ u64 features)
+{
+ if (features == (u64)-1) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
}
+ ceph_decode_need(p, end, sizeof(**lease), bad);
+ *lease = *p;
+ *p += sizeof(**lease);
+ if (features == (u64)-1)
+ *p = end;
return 0;
bad:
- return err;
+ return -EIO;
}
/*
@@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end,
if (err < 0)
goto out_bad;
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_32_safe(p, end, info->dname_len, bad);
ceph_decode_need(p, end, info->dname_len, bad);
info->dname = *p;
*p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
+
+ err = parse_reply_info_lease(p, end, &info->dlease, features);
+ if (err < 0)
+ goto out_bad;
}
if (info->head->is_target) {
@@ -183,20 +313,16 @@ out_bad:
/*
* parse readdir results
*/
-static int parse_reply_info_dir(void **p, void *end,
+static int parse_reply_info_readdir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
u32 num, i = 0;
int err;
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
@@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end,
while (num) {
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
/* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- rde->name_len = ceph_decode_32(p);
+ ceph_decode_32_safe(p, end, rde->name_len, bad);
ceph_decode_need(p, end, rde->name_len, bad);
rde->name = *p;
*p += rde->name_len;
dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
- rde->lease = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
+ /* dentry lease */
+ err = parse_reply_info_lease(p, end, &rde->lease, features);
+ if (err)
+ goto out_bad;
/* inode */
err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
@@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
- if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ if (features == (u64)-1 ||
+ (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
if (*p == end) {
info->has_create_ino = false;
} else {
@@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end,
if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_dir(p, end, info, features);
+ return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
@@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
+ s->s_cap_gen = 1;
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
@@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+
INIT_LIST_HEAD(&s->s_cap_flushing);
mdsc->sessions[mds] = s;
@@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
+ s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- if (cap->queue_release) {
- list_add_tail(&cap->session_caps,
- &session->s_cap_releases);
- session->s_num_cap_releases++;
- } else {
+ if (cap->queue_release)
+ __ceph_queue_cap_release(session, cap);
+ else
old_cap = cap; /* put_cap it w/o locks held */
- }
}
if (ret < 0)
goto out;
@@ -1638,7 +1766,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_send_cap_releases(mdsc, session);
+ ceph_flush_cap_releases(mdsc, session);
return 0;
}
@@ -1681,8 +1809,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
/*
* called under s_mutex
*/
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
@@ -1774,6 +1902,81 @@ out_err:
spin_unlock(&session->s_cap_lock);
}
+static void ceph_cap_release_work(struct work_struct *work)
+{
+ struct ceph_mds_session *session =
+ container_of(work, struct ceph_mds_session, s_cap_release_work);
+
+ mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+ session->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(session->s_mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (mdsc->stopping)
+ return;
+
+ get_session(session);
+ if (queue_work(mdsc->fsc->cap_wq,
+ &session->s_cap_release_work)) {
+ dout("cap release work queued\n");
+ } else {
+ ceph_put_mds_session(session);
+ dout("failed to queue cap release work\n");
+ }
+}
+
+/*
+ * caller holds session->s_cap_lock
+ */
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+{
+ list_add_tail(&cap->session_caps, &session->s_cap_releases);
+ session->s_num_cap_releases++;
+
+ if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+ ceph_flush_cap_releases(session->s_mdsc, session);
+}
+
+static void ceph_cap_reclaim_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_reclaim_work);
+ int ret = ceph_trim_dentries(mdsc);
+ if (ret == -EAGAIN)
+ ceph_queue_cap_reclaim_work(mdsc);
+}
+
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+ dout("caps reclaim work queued\n");
+ } else {
+ dout("failed to queue caps release work\n");
+ }
+}
+
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+ int val;
+ if (!nr)
+ return;
+ val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+ if (!(val % CEPH_CAPS_PER_RELEASE)) {
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+ ceph_queue_cap_reclaim_work(mdsc);
+ }
+}
+
/*
* requests
*/
@@ -2653,7 +2856,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+ err = parse_reply_info(msg, rinfo, (u64)-1);
+ else
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -2684,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
- ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
@@ -2693,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (realm)
ceph_put_snap_realm(mdsc, realm);
- if (err == 0 && req->r_target_inode &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
- spin_unlock(&ci->i_unsafe_lock);
+ if (err == 0) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item,
+ &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
out_err:
mutex_lock(&mdsc->mutex);
@@ -2777,6 +2988,25 @@ bad:
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
+static int __decode_and_drop_session_metadata(void **p, void *end)
+{
+ /* map<string,string> */
+ u32 n;
+ ceph_decode_32_safe(p, end, n, bad);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ return 0;
+bad:
+ return -1;
+}
+
/*
* handle a mds session control message
*/
@@ -2784,18 +3014,36 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ int mds = session->s_mds;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mds_session_head *h;
u32 op;
u64 seq;
- int mds = session->s_mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
+ unsigned long features = 0;
int wake = 0;
/* decode */
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
+ ceph_decode_need(&p, end, sizeof(*h), bad);
+ h = p;
+ p += sizeof(*h);
+
op = le32_to_cpu(h->op);
seq = le64_to_cpu(h->seq);
+ if (msg_version >= 3) {
+ u32 len;
+ /* version >= 2, metadata */
+ if (__decode_and_drop_session_metadata(&p, end) < 0)
+ goto bad;
+ /* version >= 3, feature bits */
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+ memcpy(&features, p, min_t(size_t, len, sizeof(features)));
+ p += len;
+ }
+
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
get_session(session);
@@ -2821,6 +3069,7 @@ static void handle_session(struct ceph_mds_session *session,
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
renewed_caps(mdsc, session, 0);
wake = 1;
if (mdsc->stopping)
@@ -2947,6 +3196,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
}
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
+ struct ceph_msg *reply;
+ struct ceph_pagelist *_pagelist;
+ struct page *page;
+ __le32 *addr;
+ int err = -ENOMEM;
+
+ if (!recon_state->allow_multi)
+ return -ENOSPC;
+
+ /* can't handle message that contains both caps and realm */
+ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+ /* pre-allocate new pagelist */
+ _pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!_pagelist)
+ return -ENOMEM;
+
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+ if (!reply)
+ goto fail_msg;
+
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+ if (err)
+ goto fail;
+ } else {
+ /* placeholder for nr_realms (currently encoding relams) */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+ if (err)
+ goto fail;
+
+ page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+ addr = kmap_atomic(page);
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ *addr = cpu_to_le32(recon_state->nr_caps);
+ } else {
+ /* currently encoding relams */
+ *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+ }
+ kunmap_atomic(addr);
+
+ reply->hdr.version = cpu_to_le16(5);
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+ ceph_con_send(&recon_state->session->s_con, reply);
+ ceph_pagelist_release(recon_state->pagelist);
+
+ recon_state->pagelist = _pagelist;
+ recon_state->nr_caps = 0;
+ recon_state->nr_realms = 0;
+ recon_state->msg_version = 5;
+ return 0;
+fail:
+ ceph_msg_put(reply);
+fail_msg:
+ ceph_pagelist_release(_pagelist);
+ return err;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -2966,9 +3291,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
- err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- if (err)
- return err;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -3008,7 +3330,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks = NULL;
- size_t struct_len, total_len = 0;
+ size_t struct_len, total_len = sizeof(u64);
u8 struct_v = 0;
encode_again:
@@ -3043,7 +3365,7 @@ encode_again:
if (recon_state->msg_version >= 3) {
/* version, compat_version and struct_len */
- total_len = 2 * sizeof(u8) + sizeof(u32);
+ total_len += 2 * sizeof(u8) + sizeof(u32);
struct_v = 2;
}
/*
@@ -3060,12 +3382,19 @@ encode_again:
struct_len += sizeof(u64); /* snap_follows */
total_len += struct_len;
- err = ceph_pagelist_reserve(pagelist, total_len);
- if (err) {
- kfree(flocks);
- goto out_err;
+
+ if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto out_freeflocks;
+ pagelist = recon_state->pagelist;
}
+ err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err)
+ goto out_freeflocks;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
if (recon_state->msg_version >= 3) {
ceph_pagelist_encode_8(pagelist, struct_v);
ceph_pagelist_encode_8(pagelist, 1);
@@ -3077,7 +3406,7 @@ encode_again:
num_fcntl_locks, num_flock_locks);
if (struct_v >= 2)
ceph_pagelist_encode_64(pagelist, snap_follows);
-
+out_freeflocks:
kfree(flocks);
} else {
u64 pathbase = 0;
@@ -3098,20 +3427,81 @@ encode_again:
}
err = ceph_pagelist_reserve(pagelist,
- pathlen + sizeof(u32) + sizeof(rec.v1));
+ sizeof(u64) + sizeof(u32) +
+ pathlen + sizeof(rec.v1));
if (err) {
- kfree(path);
- goto out_err;
+ goto out_freepath;
}
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-
+out_freepath:
kfree(path);
}
- recon_state->nr_caps++;
out_err:
+ if (err >= 0)
+ recon_state->nr_caps++;
+ return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_reconnect_state *recon_state)
+{
+ struct rb_node *p;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ int err = 0;
+
+ if (recon_state->msg_version >= 4) {
+ err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+ if (err < 0)
+ goto fail;
+ }
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ if (recon_state->msg_version >= 4) {
+ size_t need = sizeof(u8) * 2 + sizeof(u32) +
+ sizeof(sr_rec);
+
+ if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto fail;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, need);
+ if (err)
+ goto fail;
+
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+ }
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+
+ recon_state->nr_realms++;
+ }
+fail:
return err;
}
@@ -3132,18 +3522,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *reply;
- struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
- int s_nr_caps;
- struct ceph_pagelist *pagelist;
- struct ceph_reconnect_state recon_state;
+ struct ceph_reconnect_state recon_state = {
+ .session = session,
+ };
LIST_HEAD(dispose);
pr_info("mds%d reconnect start\n", mds);
- pagelist = ceph_pagelist_alloc(GFP_NOFS);
- if (!pagelist)
+ recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!recon_state.pagelist)
goto fail_nopagelist;
reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
@@ -3187,63 +3576,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* replay unsafe requests */
replay_unsafe_requests(mdsc, session);
+ ceph_early_kick_flushing_caps(mdsc, session);
+
down_read(&mdsc->snap_rwsem);
- /* traverse this session's caps */
- s_nr_caps = session->s_nr_caps;
- err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err)
goto fail;
- recon_state.nr_caps = 0;
- recon_state.pagelist = pagelist;
- if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
recon_state.msg_version = 3;
- else
+ recon_state.allow_multi = true;
+ } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+ recon_state.msg_version = 3;
+ } else {
recon_state.msg_version = 2;
+ }
+ /* trsaverse this session's caps */
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
- if (err < 0)
- goto fail;
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
spin_unlock(&session->s_cap_lock);
- /*
- * snaprealms. we provide mds with the ino, seq (version), and
- * parent for all of our realms. If the mds has any newer info,
- * it will tell us.
- */
- for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
- struct ceph_snap_realm *realm =
- rb_entry(p, struct ceph_snap_realm, node);
- struct ceph_mds_snaprealm_reconnect sr_rec;
+ if (err < 0)
+ goto fail;
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
- sr_rec.ino = cpu_to_le64(realm->ino);
- sr_rec.seq = cpu_to_le64(realm->seq);
- sr_rec.parent = cpu_to_le64(realm->parent_ino);
- err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
- if (err)
- goto fail;
+ /* check if all realms can be encoded into current message */
+ if (mdsc->num_snap_realms) {
+ size_t total_len =
+ recon_state.pagelist->length +
+ mdsc->num_snap_realms *
+ sizeof(struct ceph_mds_snaprealm_reconnect);
+ if (recon_state.msg_version >= 4) {
+ /* number of realms */
+ total_len += sizeof(u32);
+ /* version, compat_version and struct_len */
+ total_len += mdsc->num_snap_realms *
+ (2 * sizeof(u8) + sizeof(u32));
+ }
+ if (total_len > RECONNECT_MAX_SIZE) {
+ if (!recon_state.allow_multi) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ if (recon_state.nr_caps) {
+ err = send_reconnect_partial(&recon_state);
+ if (err)
+ goto fail;
+ }
+ recon_state.msg_version = 5;
+ }
}
- reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ err = encode_snap_realms(mdsc, &recon_state);
+ if (err < 0)
+ goto fail;
- /* raced with cap release? */
- if (s_nr_caps != recon_state.nr_caps) {
- struct page *page = list_first_entry(&pagelist->head,
- struct page, lru);
+ if (recon_state.msg_version >= 5) {
+ err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ if (recon_state.nr_caps || recon_state.nr_realms) {
+ struct page *page =
+ list_first_entry(&recon_state.pagelist->head,
+ struct page, lru);
__le32 *addr = kmap_atomic(page);
- *addr = cpu_to_le32(recon_state.nr_caps);
+ if (recon_state.nr_caps) {
+ WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ } else if (recon_state.msg_version >= 4) {
+ *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+ }
kunmap_atomic(addr);
}
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ if (recon_state.msg_version >= 4)
+ reply->hdr.compat_version = cpu_to_le16(4);
- ceph_early_kick_flushing_caps(mdsc, session);
+ reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
ceph_con_send(&session->s_con, reply);
@@ -3254,7 +3670,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
return;
fail:
@@ -3262,7 +3678,7 @@ fail:
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
fail_nomsg:
- ceph_pagelist_release(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
return;
@@ -3580,7 +3996,6 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc);
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -3628,6 +4043,12 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
+ ceph_check_delayed_caps(mdsc);
+
+ ceph_queue_cap_reclaim_work(mdsc);
+
+ ceph_trim_snapid_map(mdsc);
+
schedule_delayed(mdsc);
}
@@ -3660,6 +4081,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
+ mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
mdsc->last_tid = 0;
mdsc->oldest_tid = 0;
@@ -3677,11 +4099,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
+ INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+
+ spin_lock_init(&mdsc->dentry_list_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_leases);
+ INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+ spin_lock_init(&mdsc->snapid_map_lock);
+ mdsc->snapid_map_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snapid_map_lru);
init_rwsem(&mdsc->pool_perm_rwsem);
mdsc->pool_perm_tree = RB_ROOT;
@@ -3876,8 +4306,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
WARN_ON(!list_empty(&mdsc->cap_delay_list));
mutex_unlock(&mdsc->mutex);
+ ceph_cleanup_snapid_map(mdsc);
ceph_cleanup_empty_realms(mdsc);
+ cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
dout("stopped\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..50385a481fdb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -21,11 +21,14 @@
#define CEPHFS_FEATURE_REPLY_ENCODING 9
#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+#define CEPHFS_FEATURE_MULTI_RECONNECT 12
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
@@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in {
char *pool_ns_data;
u64 max_bytes;
u64 max_files;
+ s32 dir_pin;
};
struct ceph_mds_reply_dir_entry {
@@ -152,6 +156,7 @@ struct ceph_mds_session {
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
+ unsigned long s_features;
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
@@ -167,19 +172,20 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
+ struct ceph_cap *s_cap_iterator;
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct ceph_cap *s_cap_iterator;
+ struct work_struct s_cap_release_work;
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -310,6 +316,15 @@ struct ceph_pool_perm {
char pool_ns[];
};
+struct ceph_snapid_map {
+ struct rb_node node;
+ struct list_head lru;
+ atomic_t ref;
+ u64 snap;
+ dev_t dev;
+ unsigned long last_used;
+};
+
/*
* mds client state
*/
@@ -341,6 +356,7 @@ struct ceph_mds_client {
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
+ int num_snap_realms;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
@@ -362,6 +378,9 @@ struct ceph_mds_client {
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
/*
* Cap reservations
*
@@ -378,13 +397,18 @@ struct ceph_mds_client {
unreserved) */
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
struct rw_semaphore pool_perm_rwsem;
struct rb_root pool_perm_tree;
@@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f74193da0e09..89aa37fa0f84 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
#include <linux/sort.h>
#include <linux/slab.h>
-
#include "super.h"
#include "mds_client.h"
-
#include <linux/ceph/decode.h>
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
@@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
+ mdsc->num_snap_realms++;
+
dout("create_snap_realm %llx %p\n", realm->ino, realm);
return realm;
}
@@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
+ mdsc->num_snap_realms--;
if (realm->parent) {
list_del_init(&realm->child_item);
@@ -986,3 +990,154 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap)
+{
+ struct ceph_snapid_map *sm, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&mdsc->snapid_map_lock);
+ p = &mdsc->snapid_map_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap) {
+ p = &(*p)->rb_left;
+ } else if (snap < exist->snap) {
+ p = &(*p)->rb_right;
+ } else {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ break;
+ }
+ exist = NULL;
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
+ if (!sm)
+ return NULL;
+
+ ret = get_anon_bdev(&sm->dev);
+ if (ret < 0) {
+ kfree(sm);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&sm->lru);
+ atomic_set(&sm->ref, 1);
+ sm->snap = snap;
+
+ exist = NULL;
+ parent = NULL;
+ p = &mdsc->snapid_map_tree.rb_node;
+ spin_lock(&mdsc->snapid_map_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap)
+ p = &(*p)->rb_left;
+ else if (snap < exist->snap)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist) {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ } else {
+ rb_link_node(&sm->node, parent, p);
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ return exist;
+ }
+
+ dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm)
+{
+ if (!sm)
+ return;
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+ if (!RB_EMPTY_NODE(&sm->node)) {
+ sm->last_used = jiffies;
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+ spin_unlock(&mdsc->snapid_map_lock);
+ } else {
+ /* already cleaned up by
+ * ceph_cleanup_snapid_map() */
+ spin_unlock(&mdsc->snapid_map_lock);
+ kfree(sm);
+ }
+ }
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ unsigned long now;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ now = jiffies;
+
+ while (!list_empty(&mdsc->snapid_map_lru)) {
+ sm = list_first_entry(&mdsc->snapid_map_lru,
+ struct ceph_snapid_map, lru);
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+ break;
+
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ }
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snapid_map *sm;
+ struct rb_node *p;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
+ sm = rb_entry(p, struct ceph_snapid_map, node);
+ rb_erase(p, &mdsc->snapid_map_tree);
+ RB_CLEAR_NODE(p);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ free_anon_bdev(sm->dev);
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+ pr_err("snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
+ }
+ }
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index da2cd8e89062..6d5bb2f74612 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -133,6 +133,7 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
+ Opt_caps_max,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
@@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = {
{Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_caps_max, "caps_max=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private)
return -EINVAL;
fsopt->caps_wanted_delay_max = intval;
break;
+ case Opt_caps_max:
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->caps_max = intval;
+ break;
case Opt_readdir_max_entries:
if (intval < 1)
return -EINVAL;
@@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",rasize=%d", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_max)
+ seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
seq_printf(m, ",caps_wanted_delay_min=%d",
fsopt->caps_wanted_delay_min);
@@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (!fsc->trunc_wq)
goto fail_pg_inv_wq;
+ fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ if (!fsc->cap_wq)
+ goto fail_trunc_wq;
/* set up mempools */
err = -ENOMEM;
@@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
-
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
+ goto fail_cap_wq;
return fsc;
+fail_cap_wq:
+ destroy_workqueue(fsc->cap_wq);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
flush_workqueue(fsc->wb_wq);
flush_workqueue(fsc->pg_inv_wq);
flush_workqueue(fsc->trunc_wq);
+ flush_workqueue(fsc->cap_wq);
}
static void destroy_fs_client(struct ceph_fs_client *fsc)
@@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
+ destroy_workqueue(fsc->cap_wq);
mempool_destroy(fsc->wb_pagevec_pool);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..16c03188578e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -79,6 +79,7 @@ struct ceph_mount_options {
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
+ int caps_max;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
@@ -100,17 +101,18 @@ struct ceph_fs_client {
struct ceph_client *client;
unsigned long mount_state;
- int min_caps; /* min caps i added */
loff_t max_file_size;
struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
+ atomic_long_t writeback_count;
+
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
- atomic_long_t writeback_count;
+ struct workqueue_struct *cap_wq;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
@@ -260,17 +262,22 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ struct dentry *dentry;
struct ceph_mds_session *lease_session;
+ struct list_head lease_list;
+ unsigned flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
unsigned long time;
u64 offset;
};
+#define CEPH_DENTRY_REFERENCED 1
+#define CEPH_DENTRY_LEASE_LIST 2
+#define CEPH_DENTRY_SHRINK_LIST 4
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -318,6 +325,8 @@ struct ceph_inode_info {
/* quotas */
u64 i_max_bytes, i_max_files;
+ s32 i_dir_pin;
+
struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
@@ -370,7 +379,10 @@ struct ceph_inode_info {
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ union {
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+ };
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
@@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
@@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt);
extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
* sync write (that may/may not still update size, mtime, etc.).
@@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void __ceph_remove_caps(struct inode* inode);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
-extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req,
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 316f6ad10644..0cc42c8879e9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
ci->i_rctime.tv_nsec);
}
-/* quotas */
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
+{
+ return ci->i_dir_pin != -ENODATA;
+}
+
+static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%d", (int)ci->i_dir_pin);
+}
+/* quotas */
static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
{
bool ret = false;
@@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rbytes),
XATTR_RSTAT_FIELD(dir, rctime),
{
+ .name = "ceph.dir.pin",
+ .name_size = sizeof("ceph.dir_pin"),
+ .getxattr_cb = ceph_vxattrcb_dir_pin,
+ .exists_cb = ceph_vxattrcb_dir_pin_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ {
.name = "ceph.quota",
.name_size = sizeof("ceph.quota"),
.getxattr_cb = ceph_vxattrcb_quota,
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e92a2fee3c57..13c1288b04a7 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
- if (tcon->seal)
+
+ seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number);
+
+ if ((tcon->seal) ||
+ (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+ (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
seq_printf(m, " Encrypted");
if (tcon->nocase)
seq_printf(m, " nocase");
@@ -371,6 +376,10 @@ skip_rdma:
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
#endif
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ seq_puts(m, " encrypted");
+ if (ses->sign)
+ seq_puts(m, " signed");
seq_puts(m, "\n\tShares:");
j = 0;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d8bce2f862de..086ddc5108af 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,6 +43,9 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+/* query_info flags */
+#define PASSTHRU_QUERY_INFO 0x00000000
+#define PASSTHRU_FSCTL 0x00000001
struct smb_query_info {
__u32 info_type;
__u32 file_info_class;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f293e052e351..38feae812b47 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -479,6 +479,14 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
__le16 *path, int is_dir,
unsigned long p);
+ /* make unix special files (block, char, fifo, socket) */
+ int (*make_node)(unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ char *full_path,
+ umode_t mode,
+ dev_t device_number);
};
struct smb_version_values {
@@ -735,13 +743,13 @@ in_flight(struct TCP_Server_Info *server)
}
static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
- return num > 0;
+ return num >= num_credits;
}
static inline void
@@ -962,11 +970,14 @@ cap_unix(struct cifs_ses *ses)
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
+ bool file_all_info_is_valid:1;
+
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
struct cifs_tcon *tcon;
struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
};
/*
@@ -1735,6 +1746,7 @@ require use of the stronger protocol */
* GlobalMid_Lock protects:
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
+ * list operations on global DnotifyReqList
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b95db2b593cb..a8e9738db691 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1191,10 +1191,6 @@ next_pdu:
continue;
}
- if (server->large_buf)
- buf = server->bigbuf;
-
-
server->lstrp = jiffies;
for (i = 0; i < num_mids; i++) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 907e85d65bb4..f26a48dd2e39 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
{
int rc = -EPERM;
unsigned int xid;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- struct cifs_io_parms io_parms;
char *full_path = NULL;
- struct inode *newinode = NULL;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- FILE_ALL_INFO *buf = NULL;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
- if (tcon->unix_ext) {
- struct cifs_unix_set_info_args args = {
- .mode = mode & ~current_umask(),
- .ctime = NO_CHANGE_64,
- .atime = NO_CHANGE_64,
- .mtime = NO_CHANGE_64,
- .device = device_number,
- };
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = current_fsuid();
- args.gid = current_fsgid();
- } else {
- args.uid = INVALID_UID; /* no change */
- args.gid = INVALID_GID; /* no change */
- }
- rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc)
- goto mknod_out;
-
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
-
- if (rc == 0)
- d_instantiate(direntry, newinode);
- goto mknod_out;
- }
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto mknod_out;
-
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto mknod_out;
-
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto mknod_out;
- }
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
- if (rc)
- goto mknod_out;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)buf;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(direntry);
-
- /* FIXME: add code here to set EAs */
+ rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
+ full_path, mode,
+ device_number);
mknod_out:
kfree(full_path);
- kfree(buf);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4c144c1f50eb..2a6d20c0ce02 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1645,8 +1645,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX && !rc)
+ if (flock->fl_flags & FL_POSIX) {
+ /*
+ * If this is a request to remove all locks because we
+ * are closing the file, it doesn't matter if the
+ * unlocking failed as both cifs.ko and the SMB server
+ * remove the lock on file close
+ */
+ if (rc) {
+ cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+ if (!(flock->fl_flags & FL_CLOSE))
+ return rc;
+ }
rc = locks_lock_file_wait(file, flock);
+ }
return rc;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index f0ce27c3c6e4..c711f1f39bf2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server)
return false;
}
+static int
+cifs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *newinode = NULL;
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ if (tcon->unix_ext) {
+ /*
+ * SMB1 Unix Extensions: requires server support but
+ * works with all special files
+ */
+ struct cifs_unix_set_info_args args = {
+ .mode = mode & ~current_umask(),
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = dev,
+ };
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
+ } else {
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
+ }
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (rc)
+ goto out;
+
+ rc = cifs_get_inode_info_unix(&newinode, full_path,
+ inode->i_sb, xid);
+
+ if (rc == 0)
+ d_instantiate(dentry, newinode);
+ goto out;
+ }
+
+ /*
+ * SMB1 SFU emulation: should work with all servers, but only
+ * support block and char device (no socket & fifo)
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
#endif /* CIFS_ACL */
+ .make_node = cifs_make_node,
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 01a76bccdb8d..278405d26c47 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -37,6 +37,16 @@
#include "smb2pdu.h"
#include "smb2proto.h"
+static void
+free_set_inf_compound(struct smb_rqst *rqst)
+{
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+}
+
+
static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
PATH_MAX * 2, 0, NULL);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
* SMB2_open() call.
*/
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_RMDIR:
memset(&si_iov, 0, sizeof(si_iov));
@@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_EOF:
memset(&si_iov, 0, sizeof(si_iov));
@@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_INFO:
memset(&si_iov, 0, sizeof(si_iov));
@@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_RENAME:
memset(&si_iov, 0, sizeof(si_iov));
@@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_HARDLINK:
memset(&si_iov, 0, sizeof(si_iov));
@@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
@@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
break;
case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
if (rqst[1].rq_iov)
SMB2_close_free(&rqst[1]);
break;
case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_INFO:
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ free_set_inf_compound(rqst);
break;
}
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
@@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_shroot(xid, tcon, &fid);
if (rc)
goto out;
- rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, smb2_data);
+
+ if (tcon->crfid.file_all_info_is_valid) {
+ move_smb2_info_to_cifs(data,
+ &tcon->crfid.file_all_info);
+ } else {
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ if (!rc)
+ move_smb2_info_to_cifs(data, smb2_data);
+ }
close_shroot(&tcon->crfid);
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
goto out;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 085e91436da7..1022a3771e14 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
rc = wait_event_killable(server->request_q,
- has_credits(server, &server->credits));
+ has_credits(server, &server->credits, 1));
cifs_num_waiters_dec(server);
if (rc)
return rc;
@@ -619,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref)
SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
cfid->fid->volatile_fid);
cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
}
}
@@ -643,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work)
*/
int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
{
- struct cifs_open_parms oparams;
- int rc;
- __le16 srch_path = 0; /* Null - since an open of top of share */
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
mutex_lock(&tcon->crfid.fid_mutex);
@@ -657,22 +667,89 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
return 0;
}
- oparams.tcon = tcon;
- oparams.create_options = 0;
- oparams.desired_access = FILE_READ_ATTRIBUTES;
- oparams.disposition = FILE_OPEN;
- oparams.fid = pfid;
- oparams.reconnect = false;
-
- rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL);
- if (rc == 0) {
- memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- kref_init(&tcon->crfid.refcount);
- kref_get(&tcon->crfid.refcount);
- }
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = 0;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_exit;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto oshr_exit;
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
+ oplock = smb2_parse_lease_state(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key);
+ else
+ goto oshr_exit;
+
+
+ memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
+ tcon->crfid.tcon = tcon;
+ tcon->crfid.is_valid = true;
+ kref_init(&tcon->crfid.refcount);
+ kref_get(&tcon->crfid.refcount);
+
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&tcon->crfid.file_all_info);
+ if (rc)
+ goto oshr_exit;
+ tcon->crfid.file_all_info_is_valid = 1;
+
+ oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -1253,7 +1330,8 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_query_info __user *pqi;
int rc = 0;
int flags = 0;
- struct smb2_query_info_rsp *rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct smb2_ioctl_rsp *io_rsp = NULL;
void *buffer = NULL;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -1263,6 +1341,7 @@ smb2_ioctl_query_info(const unsigned int xid,
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_fid fid;
struct kvec qi_iov[1];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec close_iov[1];
memset(rqst, 0, sizeof(rqst));
@@ -1313,15 +1392,35 @@ smb2_ioctl_query_info(const unsigned int xid,
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.file_info_class, qi.info_type,
- qi.additional_information,
+ if (qi.flags & PASSTHRU_FSCTL) {
+ /* Can eventually relax perm check since server enforces too */
+ if (!capable(CAP_SYS_ADMIN))
+ rc = -EPERM;
+ else {
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[1].rq_iov = io_iov;
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, NULL,
+ 0);
+ }
+ } else if (qi.flags == PASSTHRU_QUERY_INFO) {
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, qi.file_info_class,
+ qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ } else { /* unknown flags */
+ cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags);
+ rc = -EINVAL;
+ }
+
if (rc)
goto iqinf_exit;
smb2_set_next_command(tcon, &rqst[1]);
@@ -1341,24 +1440,44 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype, rsp_iov);
if (rc)
goto iqinf_exit;
- pqi = (struct smb_query_info __user *)arg;
- rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
- qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
+ if (qi.flags & PASSTHRU_FSCTL) {
+ pqi = (struct smb_query_info __user *)arg;
+ io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ } else {
+ pqi = (struct smb_query_info __user *)arg;
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
}
iqinf_exit:
kfree(buffer);
SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
+ if (qi.flags & PASSTHRU_FSCTL)
+ SMB2_ioctl_free(&rqst[1]);
+ else
+ SMB2_query_info_free(&rqst[1]);
+
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
@@ -2472,22 +2591,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
+ struct cifs_ses *ses = tcon->ses;
struct inode *inode;
struct cifsInodeInfo *cifsi;
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
+ struct smb_rqst rqst[2];
+ int resp_buftype[2];
+ struct kvec rsp_iov[2];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec si_iov[1];
+ unsigned int size[1];
+ void *data[1];
long rc;
unsigned int xid;
+ int num = 0, flags = 0;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
+
+
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
@@ -2498,33 +2633,73 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
- /*
- * need to make sure we are not asked to extend the file since the SMB3
- * fsctl does not change the file size. In the future we could change
- * this to zero the first part of the range then set the file size
- * which for a non sparse file would zero the newly extended range
- */
- if (keep_size == false)
- if (i_size_read(inode) < offset + len) {
- rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
- }
-
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[num].rq_iov = io_iov;
+ rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+ rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information));
+ if (rc)
+ goto zero_range_exit;
+
+ /*
+ * do we also need to change the size of the file?
+ */
+ if (keep_size == false && i_size_read(inode) < offset + len) {
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num].rq_iov = si_iov;
+ rqst[num].rq_nvec = 1;
+
+ eof = cpu_to_le64(offset + len);
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = &eof;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num++],
+ cfile->fid.persistent_fid,
+ cfile->fid.persistent_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_related(&rqst[1]);
+ }
+
+ rc = compound_send_recv(xid, ses, flags, num, rqst,
+ resp_buftype, rsp_iov);
+
+ zero_range_exit:
+ SMB2_ioctl_free(&rqst[0]);
+ SMB2_set_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_xid(xid);
+ if (rc)
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
+ else
+ trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
return rc;
}
@@ -2573,15 +2748,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
@@ -2601,6 +2781,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* BB: in future add else clause to extend file */
else
rc = -EOPNOTSUPP;
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
}
@@ -2616,14 +2802,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
rc = -EOPNOTSUPP;
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
- rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ } else {
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ if (i_size_read(inode) < off + len) {
+ eof = cpu_to_le64(off + len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid,
+ &eof);
+ }
}
- /* BB: else ... in future add code to extend file and set sparse */
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
@@ -3604,6 +3807,104 @@ smb2_next_header(char *buf)
return le32_to_cpu(hdr->NextCommand);
}
+static int
+smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ /*
+ * Check if mounted with mount parm 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -3698,6 +3999,7 @@ struct smb_version_operations smb20_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb21_operations = {
@@ -3796,6 +4098,7 @@ struct smb_version_operations smb21_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb30_operations = {
@@ -3903,6 +4206,7 @@ struct smb_version_operations smb30_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb311_operations = {
@@ -4011,6 +4315,7 @@ struct smb_version_operations smb311_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 60fbe306f604..c399e09b76e6 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1797,9 +1797,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
return buf;
}
-static __u8
-parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key)
+__u8
+smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -2456,8 +2457,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key);
+ *oplock = smb2_parse_lease_state(server, rsp,
+ &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
@@ -2466,65 +2468,46 @@ creat_exit:
return rc;
}
-/*
- * SMB2 IOCTL is used for both IOCTLs and FSCTLs
- */
int
-SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen,
- char **out_data, u32 *plen /* returned data len */)
+SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen)
{
- struct smb_rqst rqst;
struct smb2_ioctl_req *req;
- struct smb2_ioctl_rsp *rsp;
- struct cifs_ses *ses;
- struct kvec iov[2];
- struct kvec rsp_iov;
- int resp_buftype;
- int n_iov;
- int rc = 0;
- int flags = 0;
+ struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
-
- if (tcon)
- ses = tcon->ses;
- else
- return -EIO;
-
- if (!ses || !(ses->server))
- return -EIO;
+ int rc;
rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
if (rc)
return rc;
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
req->CtlCode = cpu_to_le32(opcode);
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ iov[0].iov_base = (char *)req;
+ /*
+ * If no input data, the size of ioctl struct in
+ * protocol spec still includes a 1 byte data buffer,
+ * but if input data passed to ioctl, we do not
+ * want to double count this, so we do not send
+ * the dummy one byte of data in iovec[0] if sending
+ * input data (in iovec[1]).
+ */
if (indatalen) {
req->InputCount = cpu_to_le32(indatalen);
/* do not set InputOffset if no input data */
req->InputOffset =
cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
+ rqst->rq_nvec = 2;
+ iov[0].iov_len = total_len - 1;
iov[1].iov_base = in_data;
iov[1].iov_len = indatalen;
- n_iov = 2;
- } else
- n_iov = 1;
+ } else {
+ rqst->rq_nvec = 1;
+ iov[0].iov_len = total_len;
+ }
req->OutputOffset = 0;
req->OutputCount = 0; /* MBZ */
@@ -2546,33 +2529,70 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
else
req->Flags = 0;
- iov[0].iov_base = (char *)req;
-
- /*
- * If no input data, the size of ioctl struct in
- * protocol spec still includes a 1 byte data buffer,
- * but if input data passed to ioctl, we do not
- * want to double count this, so we do not send
- * the dummy one byte of data in iovec[0] if sending
- * input data (in iovec[1]).
- */
-
- if (indatalen) {
- iov[0].iov_len = total_len - 1;
- } else
- iov[0].iov_len = total_len;
-
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ return 0;
+}
+
+void
+SMB2_ioctl_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl,
+ char *in_data, u32 indatalen,
+ char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb_rqst rqst;
+ struct smb2_ioctl_rsp *rsp = NULL;
+ struct cifs_ses *ses;
+ struct kvec iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec rsp_iov = {NULL, 0};
+ int resp_buftype = CIFS_NO_BUFFER;
+ int rc = 0;
+ int flags = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (tcon)
+ ses = tcon->ses;
+ else
+ return -EIO;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
+ memset(&iov, 0, sizeof(iov));
rqst.rq_iov = iov;
- rqst.rq_nvec = n_iov;
+ rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid,
+ opcode, is_fsctl, in_data, indatalen);
+ if (rc)
+ goto ioctl_exit;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
if (rc != 0)
@@ -2622,6 +2642,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
}
ioctl_exit:
+ SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 0bd4d4802701..ee8977688e21 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -959,6 +959,13 @@ struct duplicate_extents_to_file {
__le64 ByteCount; /* Bytes to be copied */
} __packed;
+/*
+ * Maximum number of iovs we need for an ioctl request.
+ * [0] : struct smb2_ioctl_req
+ * [1] : in_data
+ */
+#define SMB2_IOCTL_IOV_SIZE 2
+
struct smb2_ioctl_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 87733b27a65f..3c32d0cfea69 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -144,6 +144,10 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
bool is_fsctl, char *in_data, u32 indatalen,
char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen);
+extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
@@ -223,6 +227,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 3d5f62150de4..447c0c6e4c64 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -30,9 +30,9 @@
*/
#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
struct ntstatus {
/* Facility is the high 12 bits of the following field */
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d8b049afa606..fa226de48ef3 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -59,6 +59,8 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_RW_ERR_EVENT(zero_err);
+DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
/* For logging successful read or write */
@@ -104,9 +106,13 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
DEFINE_SMB3_RW_DONE_EVENT(write_enter);
DEFINE_SMB3_RW_DONE_EVENT(read_enter);
DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_RW_DONE_EVENT(zero_done);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -242,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ const char *full_path),
+ TP_ARGS(xid, tid, sesid, full_path),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __string(path, full_path)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __assign_str(path, full_path);
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __get_str(path))
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ const char *full_path), \
+ TP_ARGS(xid, tid, sesid, full_path))
+
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid),
+ TP_ARGS(xid, tid, sesid),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid), \
+ TP_ARGS(xid, tid, sesid))
+
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, rc))
+
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+
/*
* For logging SMB3 Status code and Command for responses which return errors
*/
@@ -713,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_ARGS(currmid, hostname, credits))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ce8a585abd6..1de8e996e566 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -486,15 +486,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
}
static int
-wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits, unsigned int *instance)
+wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
+ const int timeout, const int flags,
+ unsigned int *instance)
{
int rc;
+ int *credits;
+ int optype;
+ long int t;
+
+ if (timeout < 0)
+ t = MAX_JIFFY_OFFSET;
+ else
+ t = msecs_to_jiffies(timeout);
+
+ optype = flags & CIFS_OP_MASK;
*instance = 0;
+ credits = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*credits <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
+
spin_lock(&server->req_lock);
- if (timeout == CIFS_ASYNC_OP) {
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
@@ -504,14 +520,21 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
while (1) {
- if (*credits <= 0) {
+ if (*credits < num_credits) {
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
- rc = wait_event_killable(server->request_q,
- has_credits(server, credits));
+ rc = wait_event_killable_timeout(server->request_q,
+ has_credits(server, credits, num_credits), t);
cifs_num_waiters_dec(server);
- if (rc)
- return rc;
+ if (!rc) {
+ trace_smb3_credit_timeout(server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
@@ -520,14 +543,52 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
/*
+ * For normal commands, reserve the last MAX_COMPOUND
+ * credits to compound requests.
+ * Otherwise these compounds could be permanently
+ * starved for credits by single-credit requests.
+ *
+ * To prevent spinning CPU, block this thread until
+ * there are >MAX_COMPOUND credits available.
+ * But only do this is we already have a lot of
+ * credits in flight to avoid triggering this check
+ * for servers that are slow to hand out credits on
+ * new sessions.
+ */
+ if (!optype && num_credits == 1 &&
+ server->in_flight > 2 * MAX_COMPOUND &&
+ *credits <= MAX_COMPOUND) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable_timeout(
+ server->request_q,
+ has_credits(server, credits,
+ MAX_COMPOUND + 1),
+ t);
+ cifs_num_waiters_dec(server);
+ if (!rc) {
+ trace_smb3_credit_timeout(
+ server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
+ spin_lock(&server->req_lock);
+ continue;
+ }
+
+ /*
* Can not count locking commands against total
* as they are allowed to block on server.
*/
/* update # of requests on the wire to server */
- if (timeout != CIFS_BLOCKING_OP) {
- *credits -= 1;
- server->in_flight++;
+ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
+ *credits -= num_credits;
+ server->in_flight += num_credits;
*instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
@@ -538,16 +599,36 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
static int
-wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype, unsigned int *instance)
+wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance)
{
- int *val;
+ return wait_for_free_credits(server, 1, -1, flags,
+ instance);
+}
- val = server->ops->get_credits_field(server, optype);
- /* Since an echo is already inflight, no need to wait to send another */
- if (*val <= 0 && optype == CIFS_ECHO_OP)
- return -EAGAIN;
- return wait_for_free_credits(server, timeout, val, instance);
+static int
+wait_for_compound_request(struct TCP_Server_Info *server, int num,
+ const int flags, unsigned int *instance)
+{
+ int *credits;
+
+ credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
+
+ spin_lock(&server->req_lock);
+ if (*credits < num) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (server->in_flight < num - *credits) {
+ spin_unlock(&server->req_lock);
+ return -ENOTSUPP;
+ }
+ }
+ spin_unlock(&server->req_lock);
+
+ return wait_for_free_credits(server, num, 60000, flags,
+ instance);
}
int
@@ -646,16 +727,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_handle_t *handle, void *cbdata, const int flags,
const struct cifs_credits *exist_credits)
{
- int rc, timeout, optype;
+ int rc;
struct mid_q_entry *mid;
struct cifs_credits credits = { .value = 0, .instance = 0 };
unsigned int instance;
+ int optype;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype, &instance);
+ rc = wait_for_free_request(server, flags, &instance);
if (rc)
return rc;
credits.value = 1;
@@ -871,18 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
const int flags, const int num_rqst, struct smb_rqst *rqst,
int *resp_buf_type, struct kvec *resp_iov)
{
- int i, j, rc = 0;
- int timeout, optype;
+ int i, j, optype, rc = 0;
struct mid_q_entry *midQ[MAX_COMPOUND];
bool cancelled_mid[MAX_COMPOUND] = {false};
struct cifs_credits credits[MAX_COMPOUND] = {
{ .value = 0, .instance = 0 }
};
unsigned int instance;
- unsigned int first_instance = 0;
char *buf;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
for (i = 0; i < num_rqst; i++)
@@ -896,81 +974,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->tcpStatus == CifsExiting)
return -ENOENT;
- spin_lock(&ses->server->req_lock);
- if (ses->server->credits < num_rqst) {
- /*
- * Return immediately if not too many requests in flight since
- * we will likely be stuck on waiting for credits.
- */
- if (ses->server->in_flight < num_rqst - ses->server->credits) {
- spin_unlock(&ses->server->req_lock);
- return -ENOTSUPP;
- }
- } else {
- /* enough credits to send the whole compounded request */
- ses->server->credits -= num_rqst;
- ses->server->in_flight += num_rqst;
- first_instance = ses->server->reconnect_instance;
- }
- spin_unlock(&ses->server->req_lock);
-
- if (first_instance) {
- cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst);
- for (i = 0; i < num_rqst; i++) {
- credits[i].value = 1;
- credits[i].instance = first_instance;
- }
- goto setup_rqsts;
- }
-
/*
- * There are not enough credits to send the whole compound request but
- * there are requests in flight that may bring credits from the server.
+ * Wait for all the requests to become available.
* This approach still leaves the possibility to be stuck waiting for
* credits if the server doesn't grant credits to the outstanding
- * requests. This should be fixed by returning immediately and letting
- * a caller fallback to sequential commands instead of compounding.
- * Ensure we obtain 1 credit per request in the compound chain.
+ * requests and if the client is completely idle, not generating any
+ * other requests.
+ * This can be handled by the eventual session reconnect.
*/
- for (i = 0; i < num_rqst; i++) {
- rc = wait_for_free_request(ses->server, timeout, optype,
- &instance);
-
- if (rc == 0) {
- credits[i].value = 1;
- credits[i].instance = instance;
- /*
- * All parts of the compound chain must get credits from
- * the same session, otherwise we may end up using more
- * credits than the server granted. If there were
- * reconnects in between, return -EAGAIN and let callers
- * handle it.
- */
- if (i == 0)
- first_instance = instance;
- else if (first_instance != instance) {
- i++;
- rc = -EAGAIN;
- }
- }
+ rc = wait_for_compound_request(ses->server, num_rqst, flags,
+ &instance);
+ if (rc)
+ return rc;
- if (rc) {
- /*
- * We haven't sent an SMB packet to the server yet but
- * we already obtained credits for i requests in the
- * compound chain - need to return those credits back
- * for future use. Note that we need to call add_credits
- * multiple times to match the way we obtained credits
- * in the first place and to account for in flight
- * requests correctly.
- */
- for (j = 0; j < i; j++)
- add_credits(ses->server, &credits[j], optype);
- return rc;
- }
+ for (i = 0; i < num_rqst; i++) {
+ credits[i].value = 1;
+ credits[i].instance = instance;
}
-setup_rqsts:
/*
* Make sure that we sign in the same order that we send on this socket
* and avoid races inside tcp sendmsg code that could cause corruption
@@ -981,14 +1002,12 @@ setup_rqsts:
/*
* All the parts of the compound chain belong obtained credits from the
- * same session (see the appropriate checks above). In the same time
- * there might be reconnects after those checks but before we acquired
- * the srv_mutex. We can not use credits obtained from the previous
+ * same session. We can not use credits obtained from the previous
* session to send this request. Check if there were reconnects after
* we obtained credits and return -EAGAIN in such cases to let callers
* handle it.
*/
- if (first_instance != ses->server->reconnect_instance) {
+ if (instance != ses->server->reconnect_instance) {
mutex_unlock(&ses->server->srv_mutex);
for (j = 0; j < num_rqst; j++)
add_credits(ses->server, &credits[j], optype);
@@ -1057,7 +1076,7 @@ setup_rqsts:
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
for (i = 0; i < num_rqst; i++) {
@@ -1194,7 +1213,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
int
SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int timeout)
+ int *pbytes_returned, const int flags)
{
int rc = 0;
struct mid_q_entry *midQ;
@@ -1225,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
+ rc = wait_for_free_request(ses->server, flags, &credits.instance);
if (rc)
return rc;
@@ -1264,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
rc = wait_for_response(ses->server, midQ);
@@ -1367,8 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
- &instance);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance);
if (rc)
return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 6959837cc465..ca0671d55aa6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -788,7 +788,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide start/end to follow_pte_pmd it will
+ * Note because we provide range to follow_pte_pmd it will
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
*/
@@ -843,9 +843,8 @@ unlock_pte:
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn;
+ unsigned long pfn, index, count;
long ret = 0;
- size_t size;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -894,17 +893,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_unlock_irq(xas);
/*
- * Even if dax_writeback_mapping_range() was given a wbc->range_start
- * in the middle of a PMD, the 'index' we are given will be aligned to
- * the start index of the PMD, as will the pfn we pull from 'entry'.
+ * If dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we use needs to be
+ * aligned to the start of the PMD.
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
pfn = dax_to_pfn(entry);
- size = PAGE_SIZE << dax_entry_order(entry);
+ count = 1UL << dax_entry_order(entry);
+ index = xas->xa_index & ~(count - 1);
- dax_entry_mkclean(mapping, xas->xa_index, pfn);
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
+ dax_entry_mkclean(mapping, index, pfn);
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -917,8 +917,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
dax_wake_entry(xas, entry, false);
- trace_dax_writeback_one(mapping->host, xas->xa_index,
- size >> PAGE_SHIFT);
+ trace_dax_writeback_one(mapping->host, index, count);
return ret;
put_unlocked:
@@ -1220,9 +1219,7 @@ static vm_fault_t dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
}
/*
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..553a3f3300ae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
+ s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
error = -ENOMEM;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 031e5a82d556..06f77ca7f36e 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -97,7 +97,7 @@ config EXT4_FS_SECURITY
extended attributes for file security labels, say N.
config EXT4_DEBUG
- bool "EXT4 debugging support"
+ bool "Ext4 debugging support"
depends on EXT4_FS
help
Enables run-time debugging support for the ext4 filesystem.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5012ddb6daf9..82ffdacdc7fa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -425,6 +425,9 @@ struct flex_groups {
/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -1661,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
@@ -1669,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_compat |= \
cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
@@ -1686,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
@@ -1703,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_incompat |= \
cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
@@ -2666,7 +2674,6 @@ do { \
#endif
-extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
extern int ext4_update_rocompat_feature(handle_t *handle,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 79d986dbf5af..0f89f5190cd7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2956,14 +2956,17 @@ again:
if (err < 0)
goto out;
- } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+ partial.state == initial) {
/*
- * If there's an extent to the right its first cluster
- * contains the immediate right boundary of the
- * truncated/punched region. Set partial_cluster to
- * its negative value so it won't be freed if shared
- * with the current extent. The end < ee_block case
- * is handled in ext4_ext_rm_leaf().
+ * If we're punching, there's an extent to the right.
+ * If the partial cluster hasn't been set, set it to
+ * that extent's first cluster and its state to nofree
+ * so it won't be freed should it contain blocks to be
+ * removed. If it's already set (tofree/nofree), we're
+ * retrying and keep the original partial cluster info
+ * so a cluster marked tofree as a result of earlier
+ * extent removal is not lost.
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
@@ -4048,18 +4051,8 @@ out:
} else
allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
- /*
- * if we allocated more blocks than requested
- * we need to make sure we unmap the extra block
- * allocated. The actual needed block will get
- * unmapped later when we find the buffer_head marked
- * new.
- */
- if (allocated > map->m_len) {
- clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
- allocated - map->m_len);
+ if (allocated > map->m_len)
allocated = map->m_len;
- }
map->m_len = allocated;
map_out:
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e22dcfab308b..46b24da33a28 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_TEA:
p = name;
while (len > 0) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa1507e81..c2225f0d31b5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1183,18 +1183,21 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
@@ -1433,6 +1436,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
if (++n >= n2)
return 0;
@@ -1441,6 +1445,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
if (++n >= n2)
return 0;
@@ -1449,6 +1454,7 @@ do_indirects:
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4356ef6d728e..b54b261ded36 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
* inode's preallocations.
*/
if ((ei->i_reserved_data_blocks == 0) &&
- (atomic_read(&inode->i_writecount) == 0))
+ !inode_is_open_for_write(inode))
ext4_discard_preallocations(inode);
}
@@ -678,8 +678,6 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -2489,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
BUG_ON(map->m_len == 0);
- if (map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
- }
return 0;
}
@@ -2835,12 +2828,12 @@ retry:
goto unplug;
}
ret = mpage_prepare_extent_to_map(&mpd);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
ext4_put_io_end_defer(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
- /* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
if (ret < 0)
goto unplug;
@@ -2908,10 +2901,11 @@ retry:
handle = NULL;
mpd.do_map = 0;
}
- /* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+
/*
* Drop our io_end reference we got from init. We have
* to be careful and use deferred io_end finishing if
@@ -5349,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle,
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err)
goto out_brelse;
- ext4_update_dynamic_rev(sb);
ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
@@ -6000,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
- BUFFER_TRACE(iloc.bh, "get_write_access");
+ BUFFER_TRACE(iloc->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d26bcac291bb..3c4f8bb59f8a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
+ unsigned long tmp;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_blocks, inode2->i_blocks);
- swap(inode1->i_bytes, inode2->i_bytes);
swap(inode1->i_atime, inode2->i_atime);
swap(inode1->i_mtime, inode2->i_mtime);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
- swap(ei1->i_flags, ei2->i_flags);
+ tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+ ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+ (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
swap(ei1->i_disksize, ei2->i_disksize);
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
@@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb,
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
-
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
- IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
- ext4_has_inline_data(inode))
- return -EINVAL;
-
- if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ qsize_t size, size_bl, diff;
+ blkcnt_t blocks;
+ unsigned short bytes;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
- filemap_flush(inode->i_mapping);
- filemap_flush(inode_bl->i_mapping);
-
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+ ext4_has_inline_data(inode)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto journal_err_out;
+ }
+
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto err_out;
+
+ err = filemap_write_and_wait(inode_bl->i_mapping);
+ if (err)
+ goto err_out;
+
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto journal_err_out;
+ goto err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out1;
+
+ size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+ size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+ diff = size - size_bl;
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -183,27 +206,51 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
+ /* No need to update quota information. */
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
- } else {
- err = ext4_mark_inode_dirty(handle, inode_bl);
- if (err < 0) {
- ext4_warning(inode_bl->i_sb,
- "couldn't mark inode #%lu dirty (err %d)",
- inode_bl->i_ino, err);
- /* Revert all changes: */
- swap_inode_data(inode, inode_bl);
- ext4_mark_inode_dirty(handle, inode);
- ext4_mark_inode_dirty(handle, inode_bl);
- }
+ goto err_out1;
+ }
+
+ blocks = inode_bl->i_blocks;
+ bytes = inode_bl->i_bytes;
+ inode_bl->i_blocks = inode->i_blocks;
+ inode_bl->i_bytes = inode->i_bytes;
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ goto revert;
+ }
+
+ /* Bootloader inode should not be counted into quota information. */
+ if (diff > 0)
+ dquot_free_space(inode, diff);
+ else
+ err = dquot_alloc_space(inode, -1 * diff);
+
+ if (err < 0) {
+revert:
+ /* Revert all changes: */
+ inode_bl->i_blocks = blocks;
+ inode_bl->i_bytes = bytes;
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
}
+
+err_out1:
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
+err_out:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2248083cdca..6fb76d408093 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- if ((size == isize) &&
- !ext4_fs_is_busy(sbi) &&
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+ !inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
return 0;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 6f5305e9a6ac..3e9298e6a705 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -468,10 +468,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
ext4_io_submit(io);
continue;
}
- if (buffer_new(bh)) {
+ if (buffer_new(bh))
clear_buffer_new(bh);
- clean_bdev_bh_alias(bh);
- }
set_buffer_async_write(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 48421de803b7..3d9b18505c0c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1960,7 +1960,8 @@ retry:
le16_to_cpu(es->s_reserved_gdt_blocks);
n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
n_blocks_count = (ext4_fsblk_t)n_group *
- EXT4_BLOCKS_PER_GROUP(sb);
+ EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(es->s_first_data_block);
n_group--; /* set to last group number */
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 60da0a6e4d86..f5b828bf1299 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2249,7 +2249,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 5e4e78fc0b3a..616c075da062 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@ typedef enum {
attr_feature,
attr_pointer_ui,
attr_pointer_atomic,
+ attr_journal_task,
} attr_id_t;
typedef enum {
@@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
return count;
}
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+ if (!sbi->s_journal)
+ return snprintf(buf, PAGE_SIZE, "<none>\n");
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ task_pid_vnr(sbi->s_journal->j_task));
+}
+
#define EXT4_ATTR(_name,_mode,_id) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(errors_count),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
+ ATTR_LIST(journal_task),
NULL,
};
@@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
return print_tstamp(buf, sbi->s_es, s_last_error_time);
+ case attr_journal_task:
+ return journal_task_show(sbi, buf);
}
return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86ed9c686249..dc82e7757f67 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
@@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (error == -EIO)
EXT4_ERROR_INODE(inode, "block %llu read error",
EXT4_I(inode)->i_file_acl);
+ bh = NULL;
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
@@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode,
if (IS_ERR(bh)) {
if (PTR_ERR(bh) == -ENOMEM)
return NULL;
+ bh = NULL;
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f955cd3e0677..a98e1b02279e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* collect a number of dirty meta pages and write together */
- if (wbc->for_kupdate ||
- get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_META) <
+ nr_pages_to_skip(sbi, META))
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
@@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
@@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
}
@@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
+ __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
- else
- __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+ /*
+ * TODO: we count on fsck.f2fs to clear this flag until we figure out
+ * missing cases which clear it incorrectly.
+ */
if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 568e1d09eb48..9727944139f2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -301,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
for (; start < F2FS_IO_SIZE(sbi); start++) {
struct page *page =
mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+ GFP_NOIO | __GFP_NOFAIL);
f2fs_bug_on(sbi, !page);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPagePrivate(page);
set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
lock_page(page);
@@ -1553,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
if (last_block > last_block_in_file)
last_block = last_block_in_file;
+ /* just zeroing out page which is beyond EOF */
+ if (block_in_file >= last_block)
+ goto zero_out;
/*
* Map blocks using the previous result first.
*/
@@ -1565,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
* Then do more f2fs_map_blocks() calls until we are
* done with this page.
*/
- map.m_flags = 0;
-
- if (block_in_file < last_block) {
- map.m_lblk = block_in_file;
- map.m_len = last_block - block_in_file;
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_DEFAULT))
- goto set_error_page;
- }
+ if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT))
+ goto set_error_page;
got_it:
if ((map.m_flags & F2FS_MAP_MAPPED)) {
block_nr = map.m_pblk + block_in_file - map.m_lblk;
@@ -1589,6 +1588,7 @@ got_it:
DATA_GENERIC))
goto set_error_page;
} else {
+zero_out:
zero_user_segment(page, 0, PAGE_SIZE);
if (!PageUptodate(page))
SetPageUptodate(page);
@@ -1863,8 +1863,13 @@ got_it:
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
- if (err && PageWriteback(page))
- end_page_writeback(page);
+ if (err) {
+ if (f2fs_encrypted_file(inode))
+ fscrypt_pullback_bio_page(&fio->encrypted_page,
+ true);
+ if (PageWriteback(page))
+ end_page_writeback(page);
+ }
trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
return err;
@@ -2315,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- f2fs_truncate_blocks(inode, i_size, true, true);
+ if (!IS_NOQUOTA(inode))
+ f2fs_truncate_blocks(inode, i_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2585,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
{
struct f2fs_private_dio *dio;
bool write = (bio_op(bio) == REQ_OP_WRITE);
- int err;
dio = f2fs_kzalloc(F2FS_I_SB(inode),
sizeof(struct f2fs_private_dio), GFP_NOFS);
- if (!dio) {
- err = -ENOMEM;
+ if (!dio)
goto out;
- }
dio->inode = inode;
dio->orig_end_io = bio->bi_end_io;
@@ -2710,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
clear_cold_data(page);
- /* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return f2fs_drop_inmem_page(inode, page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -2729,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 0;
clear_cold_data(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
return 1;
}
@@ -2798,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping,
return -EAGAIN;
}
- /*
- * A reference is expected if PagePrivate set when move mapping,
- * however F2FS breaks this for maintaining dirty page counts when
- * truncating pages. So here adjusting the 'extra_count' make it work.
- */
- extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+ /* one extra reference was held for atomic_write page */
+ extra_count = atomic_written ? 1 : 0;
rc = migrate_page_move_mapping(mapping, newpage,
page, mode, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) {
@@ -2824,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping,
get_page(newpage);
}
- if (PagePrivate(page))
- SetPagePrivate(newpage);
- set_page_private(newpage, page_private(page));
+ if (PagePrivate(page)) {
+ f2fs_set_page_private(newpage, page_private(page));
+ f2fs_clear_page_private(page);
+ }
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fd7f170e2f2d..99e9a5c37b71 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->free_secs = free_sections(sbi);
si->prefree_count = prefree_segments(sbi);
si->dirty_count = dirty_segments(sbi);
- si->node_pages = NODE_MAPPING(sbi)->nrpages;
- si->meta_pages = META_MAPPING(sbi)->nrpages;
+ if (sbi->node_inode)
+ si->node_pages = NODE_MAPPING(sbi)->nrpages;
+ if (sbi->meta_inode)
+ si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
@@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned npages;
int i;
if (si->base_mem)
@@ -258,10 +259,14 @@ get_cache:
sizeof(struct extent_node);
si->page_mem = 0;
- npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
- npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ if (sbi->node_inode) {
+ unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
+ if (sbi->meta_inode) {
+ unsigned npages = META_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
}
static int stat_show(struct seq_file *s, void *v)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 713b36a10a79..59bc46017855 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
ClearPageUptodate(page);
clear_cold_data(page);
inode_dec_dirty_pages(dir);
@@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (de->name_len == 0) {
bit_pos++;
ctx->pos = start_pos + bit_pos;
+ printk_ratelimited(
+ "%s, invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, le32_to_cpu(de->ino));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
/* check memory boundary before moving forward */
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- if (unlikely(bit_pos > d->max)) {
+ if (unlikely(bit_pos > d->max ||
+ le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: corrupted namelen=%d, run fsck to fix.",
__func__, le16_to_cpu(de->name_len));
@@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_get_lock_data_page(inode, n, false);
+ dentry_page = f2fs_find_data_page(inode, n);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
@@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
if (err) {
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
break;
}
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
}
out_free:
fscrypt_fname_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 1cb0fcc67d2d..caf77fe8ac07 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
unsigned int end = fofs + len;
unsigned int pos = (unsigned int)fofs;
bool updated = false;
- bool leftmost;
+ bool leftmost = false;
if (!et)
return;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7ea5c9cede37..87f75ebd2fd6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -190,6 +190,8 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
+#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
struct cp_control {
int reason;
@@ -253,7 +255,7 @@ struct discard_entry {
/* max discard pend list number */
#define MAX_PLIST_NUM 512
#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
- (MAX_PLIST_NUM - 1) : (blk_num - 1))
+ (MAX_PLIST_NUM - 1) : ((blk_num) - 1))
enum {
D_PREP, /* initial */
@@ -309,6 +311,7 @@ struct discard_policy {
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
unsigned int granularity; /* discard granularity */
+ int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -455,7 +458,6 @@ struct f2fs_flush_device {
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
-#define DEF_MIN_INLINE_SIZE 1
static inline int get_extra_isize(struct inode *inode);
static inline int get_inline_xattr_addrs(struct inode *inode);
#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
@@ -1098,6 +1100,7 @@ enum {
SBI_IS_SHUTDOWN, /* shutdown by ioctl */
SBI_IS_RECOVERED, /* recovered orphan/data */
SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */
SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
@@ -1109,6 +1112,7 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
+ UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1237,8 +1241,6 @@ struct f2fs_sb_info {
unsigned int nquota_files; /* # of quota sysfile */
- u32 s_next_generation; /* for NFS support */
-
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
@@ -1798,13 +1800,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
- count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
- count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
- count_type == F2FS_RD_META)
- return;
-
- set_sbi_flag(sbi, SBI_IS_DIRTY);
+ if (count_type == F2FS_DIRTY_DENTS ||
+ count_type == F2FS_DIRTY_NODES ||
+ count_type == F2FS_DIRTY_META ||
+ count_type == F2FS_DIRTY_QDATA ||
+ count_type == F2FS_DIRTY_IMETA)
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -2156,10 +2157,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
get_pages(sbi, F2FS_WB_CP_DATA) ||
get_pages(sbi, F2FS_DIO_READ) ||
- get_pages(sbi, F2FS_DIO_WRITE) ||
- atomic_read(&SM_I(sbi)->dcc_info->queued_discard) ||
- atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ get_pages(sbi, F2FS_DIO_WRITE))
return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info &&
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
+ return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ return false;
+
return f2fs_time_over(sbi, type);
}
@@ -2300,11 +2308,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */
#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */
#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
@@ -2761,9 +2770,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr))
#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \
- ((offsetof(typeof(*f2fs_inode), field) + \
+ ((offsetof(typeof(*(f2fs_inode)), field) + \
sizeof((f2fs_inode)->field)) \
- <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \
+ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
@@ -2792,8 +2801,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
-#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \
- (!is_read_io(fio->op) || fio->is_meta))
+#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \
+ (!is_read_io((fio)->op) || (fio)->is_meta))
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
@@ -2825,13 +2834,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
return true;
}
+static inline void f2fs_set_page_private(struct page *page,
+ unsigned long data)
+{
+ if (PagePrivate(page))
+ return;
+
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, data);
+}
+
+static inline void f2fs_clear_page_private(struct page *page)
+{
+ if (!PagePrivate(page))
+ return;
+
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
@@ -3005,7 +3034,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
@@ -3610,8 +3639,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
#endif
-#endif
-
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
{
#ifdef CONFIG_QUOTA
@@ -3624,3 +3651,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
#endif
return false;
}
+
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ba5954f41e14..5742ab8b57dc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -589,8 +589,7 @@ truncate_out:
return 0;
}
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write)
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
int count = 0, err = 0;
struct page *ipage;
bool truncate_page = false;
- int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO;
trace_f2fs_truncate_blocks_enter(inode, from);
@@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
goto free_partial;
if (lock)
- __do_map_lock(sbi, flag, true);
+ f2fs_lock_op(sbi);
ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
@@ -646,7 +644,7 @@ free_next:
err = f2fs_truncate_inode_blocks(inode, free_from);
out:
if (lock)
- __do_map_lock(sbi, flag, false);
+ f2fs_unlock_op(sbi);
free_partial:
/* lastly zero out the first data page */
if (!err)
@@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode)
return err;
}
- err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ err = f2fs_truncate_blocks(inode, i_size_read(inode), true);
if (err)
return err;
@@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
- bool size_changed = false;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
@@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
down_write(&F2FS_I(inode)->i_sem);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
up_write(&F2FS_I(inode)->i_sem);
-
- size_changed = true;
}
__setattr_copy(inode, attr);
@@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
/* file size may changed here */
- f2fs_mark_inode_dirty_sync(inode, size_changed);
+ f2fs_mark_inode_dirty_sync(inode, true);
/* inode change will produce dirty node pages flushed by checkpoint */
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
new_size = i_size_read(inode) - len;
truncate_pagecache(inode, new_size);
- ret = f2fs_truncate_blocks(inode, new_size, true, false);
+ ret = f2fs_truncate_blocks(inode, new_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
down_write(&F2FS_I(inode)->i_mmap_sem);
- ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret)
return ret;
@@ -1651,6 +1646,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
flags |= F2FS_ENCRYPT_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
flags |= F2FS_INLINE_DATA_FL;
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ flags |= F2FS_NOCOW_FL;
flags &= F2FS_FL_USER_VISIBLE;
@@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (!get_dirty_pages(inode))
- goto skip_flush;
-
- f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ /*
+ * Should wait end_io to count F2FS_WB_CP_DATA correctly by
+ * f2fs_is_atomic_file.
+ */
+ if (get_dirty_pages(inode))
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
@@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
-skip_flush:
+
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
break;
case F2FS_GOING_DOWN_NEED_FSCK:
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
- if (ret)
- goto out;
- break;
+ goto out;
default:
ret = -EINVAL;
goto out;
@@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
out:
if (in != F2FS_GOING_DOWN_FULLSYNC)
mnt_drop_write_file(filp);
+
+ trace_f2fs_shutdown(sbi, in, ret);
+
return ret;
}
@@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
__u32 pin;
int ret = 0;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
if (get_user(pin, (__u32 __user *)arg))
return -EFAULT;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index d636cbcf68f2..bb6a152310ef 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -298,7 +298,7 @@ process_inline:
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (f2fs_truncate_blocks(inode, 0, false, false))
+ if (f2fs_truncate_blocks(inode, 0, false))
return false;
goto process_inline;
}
@@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
return 0;
punch_dentry_pages:
truncate_inode_pages(&dir->i_data, 0);
- f2fs_truncate_blocks(dir, 0, false, false);
+ f2fs_truncate_blocks(dir, 0, false);
f2fs_remove_dirty_inode(dir);
return err;
}
@@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (IS_ERR(ipage))
return PTR_ERR(ipage);
+ /*
+ * f2fs_readdir was protected by inode.i_rwsem, it is safe to access
+ * ipage without page's lock held.
+ */
+ unlock_page(ipage);
+
inline_dentry = inline_data_addr(inode, ipage);
make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (!err)
ctx->pos = d.max;
- f2fs_put_page(ipage, 1);
+ f2fs_put_page(ipage, 0);
return err < 0 ? err : 0;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d910a820ae67..e7f2e8759315 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -14,6 +14,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "xattr.h"
#include <trace/events/f2fs.h>
@@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (f2fs_has_extra_attr(inode) &&
+ f2fs_sb_has_flexible_inline_xattr(sbi) &&
+ f2fs_has_inline_xattr(inode) &&
+ (!fi->i_inline_xattr_size ||
+ fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx) has corrupted "
+ "i_inline_xattr_size: %d, max: %zu",
+ __func__, inode->i_ino, fi->i_inline_xattr_size,
+ MAX_INLINE_XATTR_SIZE);
+ return false;
+ }
+
if (F2FS_I(inode)->extent_tree) {
struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e967d27c1a89..f5e34e467003 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/ctype.h>
+#include <linux/random.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
@@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = sbi->s_next_generation++;
+ inode->i_generation = prandom_u32();
if (S_ISDIR(inode->i_mode))
F2FS_I(inode)->i_current_depth = 1;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4f450e573312..3f99ab288695 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
f2fs_balance_fs_bg(sbi);
/* collect a number of dirty node pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_NODES) <
+ nr_pages_to_skip(sbi, NODE))
goto skip_write;
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 9b79056d705d..aa7fe79b62b2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
- set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
}
static int __revoke_inmem_pages(struct inode *inode,
- struct list_head *head, bool drop, bool recover)
+ struct list_head *head, bool drop, bool recover,
+ bool trylock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inmem_pages *cur, *tmp;
@@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode,
if (drop)
trace_f2fs_commit_inmem_page(page, INMEM_DROP);
- lock_page(page);
+ if (trylock) {
+ /*
+ * to avoid deadlock in between page lock and
+ * inmem_lock.
+ */
+ if (!trylock_page(page))
+ continue;
+ } else {
+ lock_page(page);
+ }
f2fs_wait_on_page_writeback(page, DATA, true, true);
@@ -270,8 +279,7 @@ next:
ClearPageUptodate(page);
clear_cold_data(page);
}
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- mutex_lock(&fi->inmem_lock);
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- mutex_unlock(&fi->inmem_lock);
+ while (!list_empty(&fi->inmem_pages)) {
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, true);
+
+ if (list_empty(&fi->inmem_pages)) {
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ }
+ mutex_unlock(&fi->inmem_lock);
+ }
clear_inode_flag(inode, FI_ATOMIC_FILE);
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
@@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
kmem_cache_free(inmem_entry_slab, cur);
ClearPageUptodate(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 0);
trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
@@ -429,12 +442,15 @@ retry:
* recovery or rewrite & commit last transaction. For other
* error number, revoking was done by filesystem itself.
*/
- err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ err = __revoke_inmem_pages(inode, &revoke_list,
+ false, true, false);
/* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, false);
} else {
- __revoke_inmem_pages(inode, &revoke_list, false, false);
+ __revoke_inmem_pages(inode, &revoke_list,
+ false, false, false);
}
return err;
@@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
struct block_device *bdev)
{
- struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
+ struct bio *bio;
int ret;
+ bio = f2fs_bio_alloc(sbi, 0, false);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
bio_set_dev(bio, bdev);
ret = submit_bio_wait(bio);
@@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi)
if (holes[DATA] > ovp || holes[NODE] > ovp)
return -EAGAIN;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
+ dirty_segments(sbi) > overprovision_segments(sbi))
+ return -EAGAIN;
return 0;
}
@@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
+ dpolicy->timeout = 0;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->max_requests = UINT_MAX;
dpolicy->io_aware = false;
+ /* we need to issue all to keep CP_TRIMMED_FLAG */
+ dpolicy->granularity = 1;
}
}
@@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
+ if (dpolicy->timeout != 0)
+ f2fs_update_time(sbi, dpolicy->timeout);
+
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ if (dpolicy->timeout != 0 &&
+ f2fs_time_over(sbi, dpolicy->timeout))
+ break;
+
if (i + 1 < dpolicy->granularity)
break;
@@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
}
/* This comes from f2fs_put_super */
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct discard_policy dpolicy;
@@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
+ dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
stat_inc_inplace_blocks(fio->sbi);
err = f2fs_submit_page_bio(fio);
- if (!err)
+ if (!err) {
update_device_state(fio);
-
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ }
return err;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a77f76f528b6..5c7ed0442d6e 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
}
}
mutex_unlock(&dcc->cmd_lock);
- if (!wakeup)
+ if (!wakeup || !is_idle(sbi, DISCARD_TIME))
return;
wake_up:
dcc->discard_wake = 1;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d1ccc52afc93..f2aaa2cc6b3e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
if (!qname) {
f2fs_msg(sb, KERN_ERR,
"Not enough memory for storing quotafile name");
- return -EINVAL;
+ return -ENOMEM;
}
if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
@@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
return -EINVAL;
- if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
f2fs_msg(sb, KERN_WARNING,
"Not support %d, larger than %d",
1 << arg, BIO_MAX_PAGES);
@@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options)
}
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+ int min_size, max_size;
+
if (!f2fs_sb_has_extra_attr(sbi) ||
!f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_msg(sb, KERN_ERR,
@@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options)
"set with inline_xattr option");
return -EINVAL;
}
- if (!F2FS_OPTION(sbi).inline_xattr_size ||
- F2FS_OPTION(sbi).inline_xattr_size >=
- DEF_ADDRS_PER_INODE -
- F2FS_TOTAL_EXTRA_ATTR_SIZE -
- DEF_INLINE_RESERVED_SIZE -
- DEF_MIN_INLINE_SIZE) {
+
+ min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32);
+ max_size = MAX_INLINE_XATTR_SIZE;
+
+ if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
+ F2FS_OPTION(sbi).inline_xattr_size > max_size) {
f2fs_msg(sb, KERN_ERR,
- "inline xattr size is out of range");
+ "inline xattr size is out of range: %d ~ %d",
+ min_size, max_size);
return -EINVAL;
}
}
@@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode)
sb_start_intwrite(inode->i_sb);
f2fs_i_size_write(inode, 0);
+ f2fs_submit_merged_write_cond(F2FS_I_SB(inode),
+ inode, NULL, 0, DATA);
+ truncate_inode_pages_final(inode->i_mapping);
+
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
@@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- dropped = f2fs_wait_discard_bios(sbi);
+ dropped = f2fs_issue_discard_timeout(sbi);
if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
!sbi->discard_blks && !dropped) {
@@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_bug_on(sbi, sbi->fsync_node_num);
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
+
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
/*
* iput() can update stat information, if f2fs_write_checkpoint()
@@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb);
static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
+ unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
- int err;
+ int err = 0;
+ int ret;
+ if (s_flags & SB_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "checkpoint=disable on readonly fs");
+ return -EINVAL;
+ }
sbi->sb->s_flags |= SB_ACTIVE;
f2fs_update_time(sbi, DISABLE_TIME);
@@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
mutex_lock(&sbi->gc_mutex);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
- if (err == -ENODATA)
+ if (err == -ENODATA) {
+ err = 0;
break;
+ }
if (err && err != -EAGAIN)
- return err;
+ break;
}
- err = sync_filesystem(sbi->sb);
- if (err)
- return err;
+ ret = sync_filesystem(sbi->sb);
+ if (ret || err) {
+ err = ret ? ret: err;
+ goto restore_flag;
+ }
- if (f2fs_disable_cp_again(sbi))
- return -EAGAIN;
+ if (f2fs_disable_cp_again(sbi)) {
+ err = -EAGAIN;
+ goto restore_flag;
+ }
mutex_lock(&sbi->gc_mutex);
cpc.reason = CP_PAUSE;
@@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
sbi->unusable_block_count = 0;
mutex_unlock(&sbi->gc_mutex);
- return 0;
+restore_flag:
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ return err;
}
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
@@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb)
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
}
}
+ /*
+ * In case of checkpoint=disable, we must flush quota blocks.
+ * This can cause NULL exception for node_inode in end_io, since
+ * put_super already dropped it.
+ */
+ sync_filesystem(sb);
}
static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
@@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+ sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
+ DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
for (i = 0; i < NR_COUNT_TYPE; i++)
@@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct f2fs_super_block *raw_super;
struct inode *root;
int err;
- bool retry = true, need_fsck = false;
+ bool skip_recovery = false, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
struct curseg_info *seg_i;
+ int retry_cnt = 1;
try_onemore:
err = -EINVAL;
@@ -3097,7 +3131,6 @@ try_onemore:
sb->s_maxbytes = sbi->max_file_blocks <<
le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
#ifdef CONFIG_QUOTA
sb->dq_op = &f2fs_quota_operations;
@@ -3200,6 +3233,10 @@ try_onemore:
if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) {
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL;
+ }
/* Initialize device list */
err = f2fs_scan_devices(sbi);
@@ -3288,7 +3325,7 @@ try_onemore:
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
- goto free_root_inode;
+ goto free_node_inode;
}
err = f2fs_register_sysfs(sbi);
@@ -3310,7 +3347,7 @@ try_onemore:
goto free_meta;
if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
- goto skip_recovery;
+ goto reset_checkpoint;
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -3327,11 +3364,13 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
- if (!retry)
- goto skip_recovery;
+ if (skip_recovery)
+ goto reset_checkpoint;
err = f2fs_recover_fsync_data(sbi, false);
if (err < 0) {
+ if (err != -ENOMEM)
+ skip_recovery = true;
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%d", err);
@@ -3347,14 +3386,14 @@ try_onemore:
goto free_meta;
}
}
-skip_recovery:
+reset_checkpoint:
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
} else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
f2fs_enable_checkpoint(sbi);
}
@@ -3367,7 +3406,7 @@ skip_recovery:
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
}
kvfree(options);
@@ -3387,8 +3426,14 @@ skip_recovery:
cur_cp_version(F2FS_CKPT(sbi)));
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
+ clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
return 0;
+sync_free_meta:
+ /* safe to flush all the data */
+ sync_filesystem(sbi->sb);
+ retry_cnt = 0;
+
free_meta:
#ifdef CONFIG_QUOTA
f2fs_truncate_quota_inode_pages(sb);
@@ -3402,6 +3447,8 @@ free_meta:
* falls into an infinite loop in f2fs_sync_meta_pages().
*/
truncate_inode_pages_final(META_MAPPING(sbi));
+ /* evict some inodes being cached by GC */
+ evict_inodes(sb);
f2fs_unregister_sysfs(sbi);
free_root_inode:
dput(sb->s_root);
@@ -3410,6 +3457,7 @@ free_node_inode:
f2fs_release_ino_entry(sbi, true);
truncate_inode_pages_final(NODE_MAPPING(sbi));
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
free_stats:
f2fs_destroy_stats(sbi);
free_nm:
@@ -3422,6 +3470,7 @@ free_devices:
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
@@ -3443,8 +3492,8 @@ free_sbi:
kvfree(sbi);
/* give only one another chance */
- if (retry) {
- retry = false;
+ if (retry_cnt > 0 && skip_recovery) {
+ retry_cnt--;
shrink_dcache_sb(sb);
goto try_onemore;
}
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 70da6801c86f..729f46a3c9ee 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -222,6 +222,8 @@ out:
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
return -EINVAL;
+ if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
+ return -EINVAL;
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -278,10 +280,16 @@ out:
return count;
}
- *ui = t;
- if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
- f2fs_reset_iostat(sbi);
+ if (!strcmp(a->attr.name, "iostat_enable")) {
+ sbi->iostat_enable = !!t;
+ if (!sbi->iostat_enable)
+ f2fs_reset_iostat(sbi);
+ return count;
+ }
+
+ *ui = (unsigned int)t;
+
return count;
}
@@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
interval_time[DISCARD_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
+ umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
+ ATTR_LIST(umount_discard_timeout),
ATTR_LIST(iostat_enable),
ATTR_LIST(readdir_ra),
ATTR_LIST(gc_pin_file_thresh),
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index ce2a5eb210b6..d0ab533a9ce8 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -14,7 +14,7 @@
#include "trace.h"
static RADIX_TREE(pids, GFP_ATOMIC);
-static struct mutex pids_lock;
+static spinlock_t pids_lock;
static struct last_io_info last_io;
static inline void __print_last_io(void)
@@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page)
set_page_private(page, (unsigned long)pid);
+retry:
if (radix_tree_preload(GFP_NOFS))
return;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
p = radix_tree_lookup(&pids, pid);
if (p == current)
goto out;
if (p)
radix_tree_delete(&pids, pid);
- f2fs_radix_tree_insert(&pids, pid, current);
+ if (radix_tree_insert(&pids, pid, current)) {
+ spin_unlock(&pids_lock);
+ radix_tree_preload_end();
+ cond_resched();
+ goto retry;
+ }
trace_printk("%3x:%3x %4x %-16s\n",
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
pid, current->comm);
out:
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
radix_tree_preload_end();
}
@@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
void f2fs_build_trace_ios(void)
{
- mutex_init(&pids_lock);
+ spin_lock_init(&pids_lock);
}
#define PIDVEC_SIZE 128
@@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void)
pid_t next_pid = 0;
unsigned int found;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
unsigned idx;
@@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void)
for (idx = 0; idx < found; idx++)
radix_tree_delete(&pids, pid[idx]);
}
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 18d5ffbc5e8c..848a785abe25 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
{
struct f2fs_xattr_entry *entry;
unsigned int inline_size = inline_xattr_size(inode);
+ void *max_addr = base_addr + inline_size;
list_for_each_xattr(entry, base_addr) {
- if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
- (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
- base_addr + inline_size) {
+ if ((void *)entry + sizeof(__u32) > max_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
*last_addr = entry;
return NULL;
}
@@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
if (!memcmp(entry->e_name, name, len))
break;
}
+
+ /* inline xattr header or entry across max inline xattr size */
+ if (IS_XATTR_LAST_ENTRY(entry) &&
+ (void *)entry + sizeof(__u32) > max_addr) {
+ *last_addr = entry;
+ return NULL;
+ }
return entry;
}
@@ -340,7 +347,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
*base_addr = txattr_addr;
return 0;
fail:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!handler || (handler->list && !handler->list(dentry)))
continue;
- prefix = handler->prefix ?: handler->name;
+ prefix = xattr_prefix(handler);
prefix_len = strlen(prefix);
size = prefix_len + entry->e_name_len + 1;
if (buffer) {
@@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 67db134da0f5..9172ee082ca8 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -78,6 +78,12 @@ struct f2fs_xattr_entry {
sizeof(struct f2fs_xattr_header) - \
sizeof(struct f2fs_xattr_entry))
+#define MAX_INLINE_XATTR_SIZE \
+ (DEF_ADDRS_PER_INODE - \
+ F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \
+ DEF_INLINE_RESERVED_SIZE - \
+ MIN_INLINE_DENTRY_SIZE / sizeof(__le32))
+
/*
* On-disk structure of f2fs_xattr
* We use inline xattrs space + 1 block for xattr.
diff --git a/fs/filesystems.c b/fs/filesystems.c
index b03f57b1105b..9135646e41ac 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fs_parser.h>
/*
* Handling of filesystem drivers list.
@@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs)
int res = 0;
struct file_system_type ** p;
+ if (fs->parameters && !fs_validate_description(fs->parameters))
+ return -EINVAL;
+
BUG_ON(strchr(fs->name, '.'));
if (fs->next)
return -EBUSY;
diff --git a/fs/fs_context.c b/fs/fs_context.c
new file mode 100644
index 000000000000..87e3546b9a52
--- /dev/null
+++ b/fs/fs_context.c
@@ -0,0 +1,642 @@
+/* Provide a way to create a superblock configuration context within the kernel
+ * that allows a superblock to be set up prior to mounting.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/security.h>
+#include <linux/mnt_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include "mount.h"
+#include "internal.h"
+
+enum legacy_fs_param {
+ LEGACY_FS_UNSET_PARAMS,
+ LEGACY_FS_MONOLITHIC_PARAMS,
+ LEGACY_FS_INDIVIDUAL_PARAMS,
+};
+
+struct legacy_fs_context {
+ char *legacy_data; /* Data page for legacy filesystems */
+ size_t data_size;
+ enum legacy_fs_param param_type;
+};
+
+static int legacy_init_fs_context(struct fs_context *fc);
+
+static const struct constant_table common_set_sb_flag[] = {
+ { "dirsync", SB_DIRSYNC },
+ { "lazytime", SB_LAZYTIME },
+ { "mand", SB_MANDLOCK },
+ { "posixacl", SB_POSIXACL },
+ { "ro", SB_RDONLY },
+ { "sync", SB_SYNCHRONOUS },
+};
+
+static const struct constant_table common_clear_sb_flag[] = {
+ { "async", SB_SYNCHRONOUS },
+ { "nolazytime", SB_LAZYTIME },
+ { "nomand", SB_MANDLOCK },
+ { "rw", SB_RDONLY },
+ { "silent", SB_SILENT },
+};
+
+static const char *const forbidden_sb_flag[] = {
+ "bind",
+ "dev",
+ "exec",
+ "move",
+ "noatime",
+ "nodev",
+ "nodiratime",
+ "noexec",
+ "norelatime",
+ "nostrictatime",
+ "nosuid",
+ "private",
+ "rec",
+ "relatime",
+ "remount",
+ "shared",
+ "slave",
+ "strictatime",
+ "suid",
+ "unbindable",
+};
+
+/*
+ * Check for a common mount option that manipulates s_flags.
+ */
+static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
+{
+ unsigned int token;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++)
+ if (strcmp(key, forbidden_sb_flag[i]) == 0)
+ return -EINVAL;
+
+ token = lookup_constant(common_set_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags |= token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ token = lookup_constant(common_clear_sb_flag, key, 0);
+ if (token) {
+ fc->sb_flags &= ~token;
+ fc->sb_flags_mask |= token;
+ return 0;
+ }
+
+ return -ENOPARAM;
+}
+
+/**
+ * vfs_parse_fs_param - Add a single parameter to a superblock config
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * A single mount option in string form is applied to the filesystem context
+ * being set up. Certain standard options (for example "ro") are translated
+ * into flag bits without going to the filesystem. The active security module
+ * is allowed to observe and poach options. Any other options are passed over
+ * to the filesystem to parse.
+ *
+ * This may be called multiple times for a context.
+ *
+ * Returns 0 on success and a negative error code on failure. In the event of
+ * failure, supplementary error information may have been set.
+ */
+int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ int ret;
+
+ if (!param->key)
+ return invalf(fc, "Unnamed parameter\n");
+
+ ret = vfs_parse_sb_flag(fc, param->key);
+ if (ret != -ENOPARAM)
+ return ret;
+
+ ret = security_fs_context_parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ /* Param belongs to the LSM or is disallowed by the LSM; so
+ * don't pass to the FS.
+ */
+ return ret;
+
+ if (fc->ops->parse_param) {
+ ret = fc->ops->parse_param(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
+ }
+
+ /* If the filesystem doesn't take any arguments, give it the
+ * default handling of source.
+ */
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ return invalf(fc, "%s: Unknown parameter '%s'",
+ fc->fs_type->name, param->key);
+}
+EXPORT_SYMBOL(vfs_parse_fs_param);
+
+/**
+ * vfs_parse_fs_string - Convenience function to just parse a string.
+ */
+int vfs_parse_fs_string(struct fs_context *fc, const char *key,
+ const char *value, size_t v_size)
+{
+ int ret;
+
+ struct fs_parameter param = {
+ .key = key,
+ .type = fs_value_is_string,
+ .size = v_size,
+ };
+
+ if (v_size > 0) {
+ param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ }
+
+ ret = vfs_parse_fs_param(fc, &param);
+ kfree(param.string);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_parse_fs_string);
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @ctx: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+ char *options = data, *key;
+ int ret = 0;
+
+ if (!options)
+ return 0;
+
+ ret = security_sb_eat_lsm_opts(options, &fc->security);
+ if (ret)
+ return ret;
+
+ while ((key = strsep(&options, ",")) != NULL) {
+ if (*key) {
+ size_t v_len = 0;
+ char *value = strchr(key, '=');
+
+ if (value) {
+ if (value == key)
+ continue;
+ *value++ = 0;
+ v_len = strlen(value);
+ }
+ ret = vfs_parse_fs_string(fc, key, value, v_len);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(generic_parse_monolithic);
+
+/**
+ * alloc_fs_context - Create a filesystem context.
+ * @fs_type: The filesystem type.
+ * @reference: The dentry from which this one derives (or NULL)
+ * @sb_flags: Filesystem/superblock flags (SB_*)
+ * @sb_flags_mask: Applicable members of @sb_flags
+ * @purpose: The purpose that this configuration shall be used for.
+ *
+ * Open a filesystem and create a mount context. The mount context is
+ * initialised with the supplied flags and, if a submount/automount from
+ * another superblock (referred to by @reference) is supplied, may have
+ * parameters such as namespaces copied across from that superblock.
+ */
+static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
+ struct dentry *reference,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask,
+ enum fs_context_purpose purpose)
+{
+ int (*init_fs_context)(struct fs_context *);
+ struct fs_context *fc;
+ int ret = -ENOMEM;
+
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->purpose = purpose;
+ fc->sb_flags = sb_flags;
+ fc->sb_flags_mask = sb_flags_mask;
+ fc->fs_type = get_filesystem(fs_type);
+ fc->cred = get_current_cred();
+ fc->net_ns = get_net(current->nsproxy->net_ns);
+
+ switch (purpose) {
+ case FS_CONTEXT_FOR_MOUNT:
+ fc->user_ns = get_user_ns(fc->cred->user_ns);
+ break;
+ case FS_CONTEXT_FOR_SUBMOUNT:
+ fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
+ break;
+ case FS_CONTEXT_FOR_RECONFIGURE:
+ /* We don't pin any namespaces as the superblock's
+ * subscriptions cannot be changed at this point.
+ */
+ atomic_inc(&reference->d_sb->s_active);
+ fc->root = dget(reference);
+ break;
+ }
+
+ /* TODO: Make all filesystems support this unconditionally */
+ init_fs_context = fc->fs_type->init_fs_context;
+ if (!init_fs_context)
+ init_fs_context = legacy_init_fs_context;
+
+ ret = init_fs_context(fc);
+ if (ret < 0)
+ goto err_fc;
+ fc->need_free = true;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+
+struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
+ unsigned int sb_flags)
+{
+ return alloc_fs_context(fs_type, NULL, sb_flags, 0,
+ FS_CONTEXT_FOR_MOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_mount);
+
+struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
+ unsigned int sb_flags,
+ unsigned int sb_flags_mask)
+{
+ return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
+ sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
+}
+EXPORT_SYMBOL(fs_context_for_reconfigure);
+
+struct fs_context *fs_context_for_submount(struct file_system_type *type,
+ struct dentry *reference)
+{
+ return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+}
+EXPORT_SYMBOL(fs_context_for_submount);
+
+void fc_drop_locked(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_locked_super(sb);
+}
+
+static void legacy_fs_context_free(struct fs_context *fc);
+
+/**
+ * vfs_dup_fc_config: Duplicate a filesystem context.
+ * @src_fc: The context to copy.
+ */
+struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
+{
+ struct fs_context *fc;
+ int ret;
+
+ if (!src_fc->ops->dup)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
+ if (!fc)
+ return ERR_PTR(-ENOMEM);
+
+ fc->fs_private = NULL;
+ fc->s_fs_info = NULL;
+ fc->source = NULL;
+ fc->security = NULL;
+ get_filesystem(fc->fs_type);
+ get_net(fc->net_ns);
+ get_user_ns(fc->user_ns);
+ get_cred(fc->cred);
+
+ /* Can't call put until we've called ->dup */
+ ret = fc->ops->dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+
+ ret = security_fs_context_dup(fc, src_fc);
+ if (ret < 0)
+ goto err_fc;
+ return fc;
+
+err_fc:
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(vfs_dup_fs_context);
+
+#ifdef CONFIG_PRINTK
+/**
+ * logfc - Log a message to a filesystem context
+ * @fc: The filesystem context to log to.
+ * @fmt: The format of the buffer.
+ */
+void logfc(struct fs_context *fc, const char *fmt, ...)
+{
+ va_list va;
+
+ va_start(va, fmt);
+
+ switch (fmt[0]) {
+ case 'w':
+ vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va);
+ break;
+ case 'e':
+ vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va);
+ break;
+ default:
+ vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va);
+ break;
+ }
+
+ pr_cont("\n");
+ va_end(va);
+}
+EXPORT_SYMBOL(logfc);
+#endif
+
+/**
+ * put_fs_context - Dispose of a superblock configuration context.
+ * @fc: The context to dispose of.
+ */
+void put_fs_context(struct fs_context *fc)
+{
+ struct super_block *sb;
+
+ if (fc->root) {
+ sb = fc->root->d_sb;
+ dput(fc->root);
+ fc->root = NULL;
+ deactivate_super(sb);
+ }
+
+ if (fc->need_free && fc->ops && fc->ops->free)
+ fc->ops->free(fc);
+
+ security_free_mnt_opts(&fc->security);
+ put_net(fc->net_ns);
+ put_user_ns(fc->user_ns);
+ put_cred(fc->cred);
+ kfree(fc->subtype);
+ put_filesystem(fc->fs_type);
+ kfree(fc->source);
+ kfree(fc);
+}
+EXPORT_SYMBOL(put_fs_context);
+
+/*
+ * Free the config for a filesystem that doesn't support fs_context.
+ */
+static void legacy_fs_context_free(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
+ kfree(ctx->legacy_data);
+ kfree(ctx);
+ }
+}
+
+/*
+ * Duplicate a legacy config.
+ */
+static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct legacy_fs_context *ctx;
+ struct legacy_fs_context *src_ctx = src_fc->fs_private;
+
+ ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
+ ctx->legacy_data = kmemdup(src_ctx->legacy_data,
+ src_ctx->data_size, GFP_KERNEL);
+ if (!ctx->legacy_data) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+
+ fc->fs_private = ctx;
+ return 0;
+}
+
+/*
+ * Add a parameter to a legacy config. We build up a comma-separated list of
+ * options.
+ */
+static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ unsigned int size = ctx->data_size;
+ size_t len = 0;
+
+ if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string source");
+ if (fc->source)
+ return invalf(fc, "VFS: Legacy: Multiple sources");
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
+ strcmp(param->key, "subtype") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "VFS: Legacy: Non-string subtype");
+ if (fc->subtype)
+ return invalf(fc, "VFS: Legacy: Multiple subtype");
+ fc->subtype = param->string;
+ param->string = NULL;
+ return 0;
+ }
+
+ if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
+ return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
+
+ switch (param->type) {
+ case fs_value_is_string:
+ len = 1 + param->size;
+ /* Fall through */
+ case fs_value_is_flag:
+ len += strlen(param->key);
+ break;
+ default:
+ return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
+ param->key);
+ }
+
+ if (len > PAGE_SIZE - 2 - size)
+ return invalf(fc, "VFS: Legacy: Cumulative options too large");
+ if (strchr(param->key, ',') ||
+ (param->type == fs_value_is_string &&
+ memchr(param->string, ',', param->size)))
+ return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
+ param->key);
+ if (!ctx->legacy_data) {
+ ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ctx->legacy_data)
+ return -ENOMEM;
+ }
+
+ ctx->legacy_data[size++] = ',';
+ len = strlen(param->key);
+ memcpy(ctx->legacy_data + size, param->key, len);
+ size += len;
+ if (param->type == fs_value_is_string) {
+ ctx->legacy_data[size++] = '=';
+ memcpy(ctx->legacy_data + size, param->string, param->size);
+ size += param->size;
+ }
+ ctx->legacy_data[size] = '\0';
+ ctx->data_size = size;
+ ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
+ return 0;
+}
+
+/*
+ * Add monolithic mount data.
+ */
+static int legacy_parse_monolithic(struct fs_context *fc, void *data)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+
+ if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
+ pr_warn("VFS: Can't mix monolithic and individual options\n");
+ return -EINVAL;
+ }
+
+ ctx->legacy_data = data;
+ ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
+ if (!ctx->legacy_data)
+ return 0;
+
+ if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
+ return 0;
+ return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
+}
+
+/*
+ * Get a mountable root with the legacy mount command.
+ */
+static int legacy_get_tree(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb;
+ struct dentry *root;
+
+ root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
+ fc->source, ctx->legacy_data);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ sb = root->d_sb;
+ BUG_ON(!sb);
+
+ fc->root = root;
+ return 0;
+}
+
+/*
+ * Handle remount.
+ */
+static int legacy_reconfigure(struct fs_context *fc)
+{
+ struct legacy_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+
+ if (!sb->s_op->remount_fs)
+ return 0;
+
+ return sb->s_op->remount_fs(sb, &fc->sb_flags,
+ ctx ? ctx->legacy_data : NULL);
+}
+
+const struct fs_context_operations legacy_fs_context_ops = {
+ .free = legacy_fs_context_free,
+ .dup = legacy_fs_context_dup,
+ .parse_param = legacy_parse_param,
+ .parse_monolithic = legacy_parse_monolithic,
+ .get_tree = legacy_get_tree,
+ .reconfigure = legacy_reconfigure,
+};
+
+/*
+ * Initialise a legacy context for a filesystem that doesn't support
+ * fs_context.
+ */
+static int legacy_init_fs_context(struct fs_context *fc)
+{
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ if (!fc->fs_private)
+ return -ENOMEM;
+ fc->ops = &legacy_fs_context_ops;
+ return 0;
+}
+
+int parse_monolithic_mount_data(struct fs_context *fc, void *data)
+{
+ int (*monolithic_mount_data)(struct fs_context *, void *);
+
+ monolithic_mount_data = fc->ops->parse_monolithic;
+ if (!monolithic_mount_data)
+ monolithic_mount_data = generic_parse_monolithic;
+
+ return monolithic_mount_data(fc, data);
+}
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
new file mode 100644
index 000000000000..842e8f749db6
--- /dev/null
+++ b/fs/fs_parser.c
@@ -0,0 +1,447 @@
+/* Filesystem parameter parser.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+static const struct constant_table bool_names[] = {
+ { "0", false },
+ { "1", true },
+ { "false", false },
+ { "no", false },
+ { "true", true },
+ { "yes", true },
+};
+
+/**
+ * lookup_constant - Look up a constant by name in an ordered table
+ * @tbl: The table of constants to search.
+ * @tbl_size: The size of the table.
+ * @name: The name to look up.
+ * @not_found: The value to return if the name is not found.
+ */
+int __lookup_constant(const struct constant_table *tbl, size_t tbl_size,
+ const char *name, int not_found)
+{
+ unsigned int i;
+
+ for (i = 0; i < tbl_size; i++)
+ if (strcmp(name, tbl[i].name) == 0)
+ return tbl[i].value;
+
+ return not_found;
+}
+EXPORT_SYMBOL(__lookup_constant);
+
+static const struct fs_parameter_spec *fs_lookup_key(
+ const struct fs_parameter_description *desc,
+ const char *name)
+{
+ const struct fs_parameter_spec *p;
+
+ if (!desc->specs)
+ return NULL;
+
+ for (p = desc->specs; p->name; p++)
+ if (strcmp(p->name, name) == 0)
+ return p;
+
+ return NULL;
+}
+
+/*
+ * fs_parse - Parse a filesystem configuration parameter
+ * @fc: The filesystem context to log errors through.
+ * @desc: The parameter description to use.
+ * @param: The parameter.
+ * @result: Where to place the result of the parse
+ *
+ * Parse a filesystem configuration parameter and attempt a conversion for a
+ * simple parameter for which this is requested. If successful, the determined
+ * parameter ID is placed into @result->key, the desired type is indicated in
+ * @result->t and any converted value is placed into an appropriate member of
+ * the union in @result.
+ *
+ * The function returns the parameter number if the parameter was matched,
+ * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that
+ * unknown parameters are okay and -EINVAL if there was a conversion issue or
+ * the parameter wasn't recognised and unknowns aren't okay.
+ */
+int fs_parse(struct fs_context *fc,
+ const struct fs_parameter_description *desc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ const struct fs_parameter_spec *p;
+ const struct fs_parameter_enum *e;
+ int ret = -ENOPARAM, b;
+
+ result->has_value = !!param->string;
+ result->negated = false;
+ result->uint_64 = 0;
+
+ p = fs_lookup_key(desc, param->key);
+ if (!p) {
+ /* If we didn't find something that looks like "noxxx", see if
+ * "xxx" takes the "no"-form negative - but only if there
+ * wasn't an value.
+ */
+ if (result->has_value)
+ goto unknown_parameter;
+ if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2])
+ goto unknown_parameter;
+
+ p = fs_lookup_key(desc, param->key + 2);
+ if (!p)
+ goto unknown_parameter;
+ if (!(p->flags & fs_param_neg_with_no))
+ goto unknown_parameter;
+ result->boolean = false;
+ result->negated = true;
+ }
+
+ if (p->flags & fs_param_deprecated)
+ warnf(fc, "%s: Deprecated parameter '%s'",
+ desc->name, param->key);
+
+ if (result->negated)
+ goto okay;
+
+ /* Certain parameter types only take a string and convert it. */
+ switch (p->type) {
+ case __fs_param_wasnt_defined:
+ return -EINVAL;
+ case fs_param_is_u32:
+ case fs_param_is_u32_octal:
+ case fs_param_is_u32_hex:
+ case fs_param_is_s32:
+ case fs_param_is_u64:
+ case fs_param_is_enum:
+ case fs_param_is_string:
+ if (param->type != fs_value_is_string)
+ goto bad_value;
+ if (!result->has_value) {
+ if (p->flags & fs_param_v_optional)
+ goto okay;
+ goto bad_value;
+ }
+ /* Fall through */
+ default:
+ break;
+ }
+
+ /* Try to turn the type we were given into the type desired by the
+ * parameter and give an error if we can't.
+ */
+ switch (p->type) {
+ case fs_param_is_flag:
+ if (param->type != fs_value_is_flag &&
+ (param->type != fs_value_is_string || result->has_value))
+ return invalf(fc, "%s: Unexpected value for '%s'",
+ desc->name, param->key);
+ result->boolean = true;
+ goto okay;
+
+ case fs_param_is_bool:
+ switch (param->type) {
+ case fs_value_is_flag:
+ result->boolean = true;
+ goto okay;
+ case fs_value_is_string:
+ if (param->size == 0) {
+ result->boolean = true;
+ goto okay;
+ }
+ b = lookup_constant(bool_names, param->string, -1);
+ if (b == -1)
+ goto bad_value;
+ result->boolean = b;
+ goto okay;
+ default:
+ goto bad_value;
+ }
+
+ case fs_param_is_u32:
+ ret = kstrtouint(param->string, 0, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_octal:
+ ret = kstrtouint(param->string, 8, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_u32_hex:
+ ret = kstrtouint(param->string, 16, &result->uint_32);
+ goto maybe_okay;
+ case fs_param_is_s32:
+ ret = kstrtoint(param->string, 0, &result->int_32);
+ goto maybe_okay;
+ case fs_param_is_u64:
+ ret = kstrtoull(param->string, 0, &result->uint_64);
+ goto maybe_okay;
+
+ case fs_param_is_enum:
+ for (e = desc->enums; e->name[0]; e++) {
+ if (e->opt == p->opt &&
+ strcmp(e->name, param->string) == 0) {
+ result->uint_32 = e->value;
+ goto okay;
+ }
+ }
+ goto bad_value;
+
+ case fs_param_is_string:
+ goto okay;
+ case fs_param_is_blob:
+ if (param->type != fs_value_is_blob)
+ goto bad_value;
+ goto okay;
+
+ case fs_param_is_fd: {
+ if (param->type != fs_value_is_file)
+ goto bad_value;
+ goto okay;
+ }
+
+ case fs_param_is_blockdev:
+ case fs_param_is_path:
+ goto okay;
+ default:
+ BUG();
+ }
+
+maybe_okay:
+ if (ret < 0)
+ goto bad_value;
+okay:
+ return p->opt;
+
+bad_value:
+ return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key);
+unknown_parameter:
+ return -ENOPARAM;
+}
+EXPORT_SYMBOL(fs_parse);
+
+/**
+ * fs_lookup_param - Look up a path referred to by a parameter
+ * @fc: The filesystem context to log errors through.
+ * @param: The parameter.
+ * @want_bdev: T if want a blockdev
+ * @_path: The result of the lookup
+ */
+int fs_lookup_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ bool want_bdev,
+ struct path *_path)
+{
+ struct filename *f;
+ unsigned int flags = 0;
+ bool put_f;
+ int ret;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ f = getname_kernel(param->string);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ put_f = true;
+ break;
+ case fs_value_is_filename_empty:
+ flags = LOOKUP_EMPTY;
+ /* Fall through */
+ case fs_value_is_filename:
+ f = param->name;
+ put_f = false;
+ break;
+ default:
+ return invalf(fc, "%s: not usable as path", param->key);
+ }
+
+ ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
+ if (ret < 0) {
+ errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
+ goto out;
+ }
+
+ if (want_bdev &&
+ !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
+ path_put(_path);
+ _path->dentry = NULL;
+ _path->mnt = NULL;
+ errorf(fc, "%s: Non-blockdev passed as '%s'",
+ param->key, f->name);
+ ret = -ENOTBLK;
+ }
+
+out:
+ if (put_f)
+ putname(f);
+ return ret;
+}
+EXPORT_SYMBOL(fs_lookup_param);
+
+#ifdef CONFIG_VALIDATE_FS_PARSER
+/**
+ * validate_constant_table - Validate a constant table
+ * @name: Name to use in reporting
+ * @tbl: The constant table to validate.
+ * @tbl_size: The size of the table.
+ * @low: The lowest permissible value.
+ * @high: The highest permissible value.
+ * @special: One special permissible value outside of the range.
+ */
+bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
+ int low, int high, int special)
+{
+ size_t i;
+ bool good = true;
+
+ if (tbl_size == 0) {
+ pr_warn("VALIDATE C-TBL: Empty\n");
+ return true;
+ }
+
+ for (i = 0; i < tbl_size; i++) {
+ if (!tbl[i].name) {
+ pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
+ good = false;
+ } else if (i > 0 && tbl[i - 1].name) {
+ int c = strcmp(tbl[i-1].name, tbl[i].name);
+
+ if (c == 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
+ i, tbl[i].name);
+ good = false;
+ }
+ if (c > 0) {
+ pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
+ i, tbl[i-1].name, tbl[i].name);
+ good = false;
+ }
+ }
+
+ if (tbl[i].value != special &&
+ (tbl[i].value < low || tbl[i].value > high)) {
+ pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
+ i, tbl[i].name, tbl[i].value, low, high);
+ good = false;
+ }
+ }
+
+ return good;
+}
+
+/**
+ * fs_validate_description - Validate a parameter description
+ * @desc: The parameter description to validate.
+ */
+bool fs_validate_description(const struct fs_parameter_description *desc)
+{
+ const struct fs_parameter_spec *param, *p2;
+ const struct fs_parameter_enum *e;
+ const char *name = desc->name;
+ unsigned int nr_params = 0;
+ bool good = true, enums = false;
+
+ pr_notice("*** VALIDATE %s ***\n", name);
+
+ if (!name[0]) {
+ pr_err("VALIDATE Parser: No name\n");
+ name = "Unknown";
+ good = false;
+ }
+
+ if (desc->specs) {
+ for (param = desc->specs; param->name; param++) {
+ enum fs_parameter_type t = param->type;
+
+ /* Check that the type is in range */
+ if (t == __fs_param_wasnt_defined ||
+ t >= nr__fs_parameter_type) {
+ pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n",
+ name, param->name, t);
+ good = false;
+ } else if (t == fs_param_is_enum) {
+ enums = true;
+ }
+
+ /* Check for duplicate parameter names */
+ for (p2 = desc->specs; p2 < param; p2++) {
+ if (strcmp(param->name, p2->name) == 0) {
+ pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ }
+
+ nr_params = param - desc->specs;
+ }
+
+ if (desc->enums) {
+ if (!nr_params) {
+ pr_err("VALIDATE %s: Enum table but no parameters\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ if (!enums) {
+ pr_err("VALIDATE %s: Enum table but no enum-type values\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+
+ for (e = desc->enums; e->name[0]; e++) {
+ /* Check that all entries in the enum table have at
+ * least one parameter that uses them.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->opt == e->opt &&
+ param->type != fs_param_is_enum) {
+ pr_err("VALIDATE %s: e[%lu] enum val for %s\n",
+ name, e - desc->enums, param->name);
+ good = false;
+ }
+ }
+ }
+
+ /* Check that all enum-type parameters have at least one enum
+ * value in the enum table.
+ */
+ for (param = desc->specs; param->name; param++) {
+ if (param->type != fs_param_is_enum)
+ continue;
+ for (e = desc->enums; e->name[0]; e++)
+ if (e->opt == param->opt)
+ break;
+ if (!e->name[0]) {
+ pr_err("VALIDATE %s: PARAM[%s] enum with no values\n",
+ name, param->name);
+ good = false;
+ }
+ }
+ } else {
+ if (enums) {
+ pr_err("VALIDATE %s: enum-type values, but no enum table\n",
+ name);
+ good = false;
+ goto no_enums;
+ }
+ }
+
+no_enums:
+ return good;
+}
+#endif /* CONFIG_VALIDATE_FS_PARSER */
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 989df5accaee..fe80bea4ad89 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
{
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fuse_abort_conn(fc, true);
+ if (fc->abort_err)
+ fc->aborted = true;
+ fuse_abort_conn(fc);
fuse_conn_put(fc);
}
return count;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8f68181256c0..55a26f351467 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
- fuse_sync_release(ff, file->f_flags);
+ fuse_sync_release(fi, ff, file->f_flags);
fuse_conn_put(fc);
return 0;
@@ -407,7 +408,7 @@ err_unlock:
err_region:
unregister_chrdev_region(devt, 1);
err:
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
goto out;
}
@@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
{
struct cuse_conn *cc = dev_get_drvdata(dev);
- fuse_abort_conn(&cc->fc, false);
+ fuse_abort_conn(&cc->fc);
return count;
}
static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 809c0f2f9942..8a63e52785e9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
struct file *file)
{
struct fuse_req *req = NULL;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
do {
wait_event(fc->reserved_req_waitq, ff->reserved_req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (ff->reserved_req) {
req = ff->reserved_req;
ff->reserved_req = NULL;
req->stolen_file = get_file(file);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
} while (!req);
return req;
@@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
{
struct file *file = req->stolen_file;
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
WARN_ON(req->max_pages);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
memset(req, 0, sizeof(*req));
fuse_request_init(req, NULL, NULL, 0);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fput(file);
}
@@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
-
- spin_lock(&fiq->waitq.lock);
- list_del_init(&req->intr_entry);
- spin_unlock(&fiq->waitq.lock);
+ /*
+ * test_and_set_bit() implies smp_mb() between bit
+ * changing and below intr_entry check. Pairs with
+ * smp_mb() from queue_interrupt().
+ */
+ if (!list_empty(&req->intr_entry)) {
+ spin_lock(&fiq->waitq.lock);
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ }
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fc->active_background--;
flush_bg_queue(fc);
spin_unlock(&fc->bg_lock);
+ } else {
+ /* Wake up waiter sleeping in request_wait_answer() */
+ wake_up(&req->waitq);
}
- wake_up(&req->waitq);
+
if (req->end)
req->end(fc, req);
put_request:
fuse_put_request(fc, req);
}
-static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->waitq.lock);
- if (test_bit(FR_FINISHED, &req->flags)) {
+ /* Check for we've sent request to interrupt this req */
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
spin_unlock(&fiq->waitq.lock);
- return;
+ return -EINVAL;
}
+
if (list_empty(&req->intr_entry)) {
list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ return 0;
+ }
wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
}
spin_unlock(&fiq->waitq.lock);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ return 0;
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
@@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
goto err_unlock;
if (!fiq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto err_unlock;
}
@@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
spin_lock(&fpq->lock);
clear_bit(FR_LOCKED, &req->flags);
if (!fpq->connected) {
- err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
goto out_end;
}
if (err) {
@@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ err = -EINVAL;
if (nbytes < sizeof(struct fuse_out_header))
- return -EINVAL;
+ goto out;
err = fuse_copy_one(cs, &oh, sizeof(oh));
if (err)
- goto err_finish;
+ goto copy_finish;
err = -EINVAL;
if (oh.len != nbytes)
- goto err_finish;
+ goto copy_finish;
/*
* Zero oh.unique indicates unsolicited notification message
@@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
*/
if (!oh.unique) {
err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
- return err ? err : nbytes;
+ goto out;
}
err = -EINVAL;
if (oh.error <= -1000 || oh.error > 0)
- goto err_finish;
+ goto copy_finish;
spin_lock(&fpq->lock);
- err = -ENOENT;
- if (!fpq->connected)
- goto err_unlock_pq;
+ req = NULL;
+ if (fpq->connected)
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
- if (!req)
- goto err_unlock_pq;
+ err = -ENOENT;
+ if (!req) {
+ spin_unlock(&fpq->lock);
+ goto copy_finish;
+ }
/* Is it an interrupt reply ID? */
if (oh.unique & FUSE_INT_REQ_BIT) {
__fuse_get_request(req);
spin_unlock(&fpq->lock);
- err = -EINVAL;
- if (nbytes != sizeof(struct fuse_out_header)) {
- fuse_put_request(fc, req);
- goto err_finish;
- }
-
- if (oh.error == -ENOSYS)
+ err = 0;
+ if (nbytes != sizeof(struct fuse_out_header))
+ err = -EINVAL;
+ else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(&fc->iq, req);
+
fuse_put_request(fc, req);
- fuse_copy_finish(cs);
- return nbytes;
+ goto copy_finish;
}
clear_bit(FR_SENT, &req->flags);
@@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_unlock(&fpq->lock);
request_end(fc, req);
-
+out:
return err ? err : nbytes;
- err_unlock_pq:
- spin_unlock(&fpq->lock);
- err_finish:
+copy_finish:
fuse_copy_finish(cs);
- return err;
+ goto out;
}
static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
@@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
return mask;
}
-/*
- * Abort all requests on the given list (pending or processing)
- *
- * This function releases and reacquires fc->lock
- */
+/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
while (!list_empty(head)) {
@@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc)
* is OK, the request will in that case be removed from the list before we touch
* it.
*/
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
+void fuse_abort_conn(struct fuse_conn *fc)
{
struct fuse_iqueue *fiq = &fc->iq;
@@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
fc->connected = 0;
spin_unlock(&fc->bg_lock);
- fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
struct fuse_pqueue *fpq = &fud->pq;
@@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
WARN_ON(fc->iq.fasync != NULL);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
}
fuse_dev_free(fud);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e909678afa2d..dd0f64f7bc06 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
args->out.args[0].value = outarg;
}
-u64 fuse_get_attr_version(struct fuse_conn *fc)
-{
- u64 curr_version;
-
- /*
- * The spin lock isn't actually needed on 64bit archs, but we
- * don't yet care too much about such optimizations.
- */
- spin_lock(&fc->lock);
- curr_version = fc->attr_version;
- spin_unlock(&fc->lock);
-
- return curr_version;
-}
-
/*
* Check whether the dentry is still valid
*
@@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
goto invalid;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
kfree(forget);
if (ret == -ENOMEM)
@@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_create_in inarg;
struct fuse_open_out outopen;
struct fuse_entry_out outentry;
+ struct fuse_inode *fi;
struct fuse_file *ff;
/* Userspace expects S_IFREG in create mode */
@@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
&outentry.attr, entry_attr_timeout(&outentry), 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
- fuse_sync_release(ff, flags);
+ fuse_sync_release(NULL, ff, flags);
fuse_queue_forget(fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
@@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
fuse_dir_changed(dir);
err = finish_open(file, entry, generic_file_open);
if (err) {
- fuse_sync_release(ff, flags);
+ fi = get_fuse_inode(inode);
+ fuse_sync_release(fi, ff, flags);
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
@@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
@@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
*/
if (inode->i_nlink > 0)
drop_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
@@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
if (!err) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
inc_nlink(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
} else if (err == -EINTR) {
@@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
*/
void fuse_set_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
BUG_ON(!inode_is_locked(inode));
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
BUG_ON(fi->writectr < 0);
fi->writectr += FUSE_NOWRITE;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
}
@@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode)
void fuse_release_nowrite(struct inode *inode)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
__fuse_release_nowrite(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
@@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
goto error;
}
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
@@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
- /* NOTE: this may release/reacquire fc->lock */
+ /* NOTE: this may release/reacquire fi->lock */
__fuse_release_nowrite(inode);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
/*
* Only call invalidate_inode_pages2() after removing
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a59c16bd90ac..06096b60f1df 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,8 +19,6 @@
#include <linux/falloc.h>
#include <linux/uio.h>
-static const struct file_operations fuse_direct_io_file_operations;
-
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
@@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- spin_lock(&fc->lock);
- ff->kh = ++fc->khctr;
- spin_unlock(&fc->lock);
+ ff->kh = atomic64_inc_return(&fc->khctr);
return ff;
}
@@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (ff->fc->no_open && !isdir) {
+ if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
/*
* Drop the release request when client does not
* implement 'open'
@@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
return -ENOMEM;
ff->fh = 0;
- ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
- if (!fc->no_open || isdir) {
+ /* Default for no-open */
+ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+ if (isdir ? !fc->no_opendir : !fc->no_open) {
struct fuse_open_out outarg;
int err;
@@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
- } else if (err != -ENOSYS || isdir) {
+ } else if (err != -ENOSYS) {
fuse_file_free(ff);
return err;
} else {
- fc->no_open = 1;
+ if (isdir)
+ fc->no_opendir = 1;
+ else
+ fc->no_open = 1;
}
}
@@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
static void fuse_link_write_file(struct file *file)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
/*
* file may be written through mmap, so chain it onto the
* inodes's write_file list
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (list_empty(&ff->write_entry))
list_add(&ff->write_entry, &fi->write_files);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
void fuse_finish_open(struct inode *inode, struct file *file)
@@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (ff->open_flags & FOPEN_DIRECT_IO)
- file->f_op = &fuse_direct_io_file_operations;
if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_NONSEEKABLE)
@@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
@@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
return err;
}
-static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+ int flags, int opcode)
{
struct fuse_conn *fc = ff->fc;
struct fuse_req *req = ff->reserved_req;
struct fuse_release_in *inarg = &req->misc.release.in;
+ /* Inode is NULL on error path of fuse_create_open() */
+ if (likely(fi)) {
+ spin_lock(&fi->lock);
+ list_del(&ff->write_entry);
+ spin_unlock(&fi->lock);
+ }
spin_lock(&fc->lock);
- list_del(&ff->write_entry);
if (!RB_EMPTY_NODE(&ff->polled_node))
rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
@@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
void fuse_release_common(struct file *file, bool isdir)
{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
struct fuse_file *ff = file->private_data;
struct fuse_req *req = ff->reserved_req;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(ff, file->f_flags, opcode);
+ fuse_prepare_release(fi, ff, file->f_flags, opcode);
if (ff->flock) {
struct fuse_release_in *inarg = &req->misc.release.in;
@@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file)
return 0;
}
-void fuse_sync_release(struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
- fuse_prepare_release(ff, flags, FUSE_RELEASE);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
* synchronous, we are fine with not doing igrab() here"
@@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
return (u64) v0 + ((u64) v1 << 32);
}
-/*
- * Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
- */
-static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
- pgoff_t idx_to)
+static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi,
+ pgoff_t idx_from, pgoff_t idx_to)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
- bool found = false;
- spin_lock(&fc->lock);
list_for_each_entry(req, &fi->writepages, writepages_entry) {
pgoff_t curr_index;
- BUG_ON(req->inode != inode);
+ WARN_ON(get_fuse_inode(req->inode) != fi);
curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + req->num_pages &&
curr_index <= idx_to) {
- found = true;
- break;
+ return req;
}
}
- spin_unlock(&fc->lock);
+ return NULL;
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool found;
+
+ spin_lock(&fi->lock);
+ found = fuse_find_writeback(fi, idx_from, idx_to);
+ spin_unlock(&fi->lock);
return found;
}
@@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
- spin_unlock(&fc->lock);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
}
io->iocb->ki_complete(io->iocb, res, 0);
@@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (attr_ver == fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- fi->attr_version = ++fc->attr_version;
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
}
static void fuse_short_read(struct fuse_req *req, struct inode *inode,
@@ -919,7 +929,7 @@ out:
return err;
}
-static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
struct fuse_inode *fi = get_fuse_inode(inode);
bool ret = false;
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
if (pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ret;
}
@@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
int err = 0;
ssize_t res = 0;
- if (is_bad_inode(inode))
- return -EIO;
-
if (inode->i_size < pos + iov_iter_count(ii))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
return res > 0 ? res : err;
}
-static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
ssize_t res;
struct inode *inode = file_inode(io->iocb->ki_filp);
- if (is_bad_inode(inode))
- return -EIO;
-
res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_atime(inode);
@@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
return res;
}
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
- return __fuse_direct_read(&io, to, &iocb->ki_pos);
+ ssize_t res;
+
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, to);
+ } else {
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
+ res = __fuse_direct_read(&io, to, &iocb->ki_pos);
+ }
+
+ return res;
}
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
- if (is_bad_inode(inode))
- return -EIO;
-
/* Don't allow parallel writes to the same file */
inode_lock(inode);
res = generic_write_checks(iocb, from);
- if (res > 0)
- res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (res > 0) {
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, from);
+ } else {
+ res = fuse_direct_io(&io, from, &iocb->ki_pos,
+ FUSE_DIO_WRITE);
+ }
+ }
fuse_invalidate_attr(inode);
if (res > 0)
fuse_write_update_size(inode, iocb->ki_pos);
@@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
return res;
}
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_read_iter(iocb, to);
+ else
+ return fuse_direct_read_iter(iocb, to);
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (is_bad_inode(file_inode(file)))
+ return -EIO;
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_write_iter(iocb, from);
+ else
+ return fuse_direct_write_iter(iocb, from);
+}
+
static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
int i;
@@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fi->page_waitq);
}
-/* Called under fc->lock, may release and reacquire it */
+/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
loff_t size)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
+ struct fuse_req *aux, *next;
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_SIZE;
bool queued;
- if (!fc->connected)
- goto out_free;
-
if (inarg->offset + data_size <= size) {
inarg->size = data_size;
} else if (inarg->offset < size) {
@@ -1505,28 +1549,40 @@ __acquires(fc->lock)
}
req->in.args[1].size = inarg->size;
- fi->writectr++;
queued = fuse_request_queue_background(fc, req);
- WARN_ON(!queued);
+ /* Fails on broken connection only */
+ if (unlikely(!queued))
+ goto out_free;
+
+ fi->writectr++;
return;
out_free:
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
+
+ /* After fuse_writepage_finish() aux request list is private */
+ for (aux = req->misc.write.next; aux; aux = next) {
+ next = aux->misc.write.next;
+ aux->misc.write.next = NULL;
+ fuse_writepage_free(fc, aux);
+ fuse_put_request(fc, aux);
+ }
+
fuse_writepage_free(fc, req);
fuse_put_request(fc, req);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
}
/*
* If fi->writectr is positive (no truncate or fsync going on) send
* all queued writepage requests.
*
- * Called with fc->lock
+ * Called with fi->lock
*/
void fuse_flush_writepages(struct inode *inode)
-__releases(fc->lock)
-__acquires(fc->lock)
+__releases(fi->lock)
+__acquires(fi->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
struct fuse_inode *fi = get_fuse_inode(inode);
mapping_set_error(inode->i_mapping, req->out.h.error);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
while (req->misc.write.next) {
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_write_in *inarg = &req->misc.write.in;
@@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
}
fi->writectr--;
fuse_writepage_finish(fc, req);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_writepage_free(fc, req);
}
@@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
{
struct fuse_file *ff = NULL;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if (!list_empty(&fi->write_files)) {
ff = list_entry(fi->write_files.next, struct fuse_file,
write_entry);
fuse_file_get(ff);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return ff;
}
@@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page)
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
end_page_writeback(page);
@@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
int num_pages = req->num_pages;
int i;
req->ff = fuse_file_get(data->ff);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add_tail(&req->list, &fi->queued_writes);
fuse_flush_writepages(inode);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
for (i = 0; i < num_pages; i++)
end_page_writeback(data->orig_pages[i]);
}
+/*
+ * First recheck under fi->lock if the offending offset is still under
+ * writeback. If yes, then iterate auxiliary write requests, to see if there's
+ * one already added for a page at this offset. If there's none, then insert
+ * this new request onto the auxiliary list, otherwise reuse the existing one by
+ * copying the new page contents over to the old temporary page.
+ */
static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct page *page)
{
@@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
struct fuse_inode *fi = get_fuse_inode(new_req->inode);
struct fuse_req *tmp;
struct fuse_req *old_req;
- bool found = false;
- pgoff_t curr_index;
- BUG_ON(new_req->num_pages != 0);
+ WARN_ON(new_req->num_pages != 0);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_del(&new_req->writepages_entry);
- list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
- BUG_ON(old_req->inode != new_req->inode);
- curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
- if (curr_index <= page->index &&
- page->index < curr_index + old_req->num_pages) {
- found = true;
- break;
- }
- }
- if (!found) {
+ old_req = fuse_find_writeback(fi, page->index, page->index);
+ if (!old_req) {
list_add(&new_req->writepages_entry, &fi->writepages);
- goto out_unlock;
+ spin_unlock(&fi->lock);
+ return false;
}
new_req->num_pages = 1;
- for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
- BUG_ON(tmp->inode != new_req->inode);
+ for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) {
+ pgoff_t curr_index;
+
+ WARN_ON(tmp->inode != new_req->inode);
curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
- if (tmp->num_pages == 1 &&
- curr_index == page->index) {
- old_req = tmp;
+ if (curr_index == page->index) {
+ WARN_ON(tmp->num_pages != 1);
+ WARN_ON(!test_bit(FR_PENDING, &tmp->flags));
+ swap(tmp->pages[0], new_req->pages[0]);
+ break;
}
}
- if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
- struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
+ if (!tmp) {
+ new_req->misc.write.next = old_req->misc.write.next;
+ old_req->misc.write.next = new_req;
+ }
+
+ spin_unlock(&fi->lock);
- copy_highpage(old_req->pages[0], page);
- spin_unlock(&fc->lock);
+ if (tmp) {
+ struct backing_dev_info *bdi = inode_to_bdi(new_req->inode);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
- goto out;
- } else {
- new_req->misc.write.next = old_req->misc.write.next;
- old_req->misc.write.next = new_req;
}
-out_unlock:
- spin_unlock(&fc->lock);
-out:
- return found;
+
+ return true;
}
static int fuse_writepages_fill(struct page *page,
@@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_fill_wb_data *data = _data;
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
bool is_writeback;
@@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page,
req->end = fuse_writepage_end;
req->inode = inode;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
list_add(&req->writepages_entry, &fi->writepages);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
data->req = req;
}
@@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page,
data->orig_pages[req->num_pages] = page;
/*
- * Protected by fc->lock against concurrent access by
+ * Protected by fi->lock against concurrent access by
* fuse_page_is_writeback().
*/
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
req->num_pages++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
out_unlock:
unlock_page(page);
@@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_DIRECT_IO) {
+ /* Can't provide the coherency needed for MAP_SHARED */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+
+ invalidate_inode_pages2(file->f_mapping);
+
+ return generic_file_mmap(file, vma);
+ }
+
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
fuse_link_write_file(file);
@@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Can't provide the coherency needed for MAP_SHARED */
- if (vma->vm_flags & VM_MAYSHARE)
- return -ENODEV;
-
- invalidate_inode_pages2(file->f_mapping);
-
- return generic_file_mmap(file, vma);
-}
-
static int convert_fuse_file_lock(struct fuse_conn *fc,
const struct fuse_file_lock *ffl,
struct file_lock *fl)
@@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
@@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = {
.copy_file_range = fuse_copy_file_range,
};
-static const struct file_operations fuse_direct_io_file_operations = {
- .llseek = fuse_file_llseek,
- .read_iter = fuse_direct_read_iter,
- .write_iter = fuse_direct_write_iter,
- .mmap = fuse_direct_mmap,
- .open = fuse_open,
- .flush = fuse_flush,
- .release = fuse_release,
- .fsync = fuse_fsync,
- .lock = fuse_file_lock,
- .flock = fuse_file_flock,
- .unlocked_ioctl = fuse_file_ioctl,
- .compat_ioctl = fuse_file_compat_ioctl,
- .poll = fuse_file_poll,
- .fallocate = fuse_file_fallocate,
- /* no splice_read */
-};
-
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
.writepage = fuse_writepage,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2f2c92e6f8cb..0920c0c032a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -96,7 +96,7 @@ struct fuse_inode {
union {
/* Write related fields (regular file only) */
struct {
- /* Files usable in writepage. Protected by fc->lock */
+ /* Files usable in writepage. Protected by fi->lock */
struct list_head write_files;
/* Writepages pending on truncate or fsync */
@@ -144,6 +144,9 @@ struct fuse_inode {
/** Lock for serializing lookup and readdir for back compatibility*/
struct mutex mutex;
+
+ /** Lock to protect write related fields */
+ spinlock_t lock;
};
/** FUSE inode state bits */
@@ -163,7 +166,10 @@ struct fuse_file {
/** Fuse connection for this file */
struct fuse_conn *fc;
- /** Request reserved for flush and release */
+ /*
+ * Request reserved for flush and release.
+ * Modified under relative fuse_inode::lock.
+ */
struct fuse_req *reserved_req;
/** Kernel file handle guaranteed to be unique */
@@ -538,7 +544,7 @@ struct fuse_conn {
struct fuse_iqueue iq;
/** The next unique kernel file handle */
- u64 khctr;
+ atomic64_t khctr;
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
@@ -624,6 +630,9 @@ struct fuse_conn {
/** Is open/release not implemented by fs? */
unsigned no_open:1;
+ /** Is opendir/releasedir not implemented by fs? */
+ unsigned no_opendir:1;
+
/** Is fsync not implemented by fs? */
unsigned no_fsync:1;
@@ -730,7 +739,7 @@ struct fuse_conn {
struct fuse_req *destroy_req;
/** Version counter for attribute changes */
- u64 attr_version;
+ atomic64_t attr_version;
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid)
return !nodeid || nodeid == FUSE_ROOT_ID;
}
+static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+ return atomic64_read(&fc->attr_version);
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
-void fuse_sync_release(struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
@@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
/* Abort all requests */
-void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
+void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
/**
@@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode);
void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
-u64 fuse_get_attr_version(struct fuse_conn *fc);
-
/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c2d4099429be..ec5d9953dfb6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
kmem_cache_free(fuse_inode_cachep, inode);
@@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- fi->attr_version = ++fc->attr_version;
+ lockdep_assert_held(&fi->lock);
+
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
fi->i_time = attr_valid;
WRITE_ONCE(fi->inval_mask, 0);
@@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
loff_t oldsize;
struct timespec64 old_mtime;
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
return;
}
@@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
*/
if (!is_wb || !S_ISREG(inode->i_mode))
i_size_write(inode, attr->size);
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
if (!is_wb && S_ISREG(inode->i_mode)) {
bool inval = false;
@@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
fuse_change_attributes(inode, attr, attr_valid, attr_version);
return inode;
@@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
static void fuse_umount_begin(struct super_block *sb)
{
- fuse_abort_conn(get_fuse_conn_super(sb), false);
+ fuse_abort_conn(get_fuse_conn_super(sb));
}
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
atomic_set(&fc->num_waiting, 0);
fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
- fc->khctr = 0;
+ atomic64_set(&fc->khctr, 0);
fc->polled_files = RB_ROOT;
fc->blocked = 0;
fc->initialized = 0;
fc->connected = 1;
- fc->attr_version = 1;
+ atomic64_set(&fc->attr_version, 1);
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
@@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
- FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+ FUSE_NO_OPENDIR_SUPPORT;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
@@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb)
if (fc) {
fuse_send_destroy(fc);
- fuse_abort_conn(fc, false);
+ fuse_abort_conn(fc);
fuse_wait_aborted(fc);
down_write(&fc->killsb);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index ab18b78f4755..574d03f8a573 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -213,9 +213,9 @@ retry:
}
fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
+ spin_lock(&fi->lock);
fi->nlookup++;
- spin_unlock(&fc->lock);
+ spin_unlock(&fi->lock);
forget_all_cached_acls(inode);
fuse_change_attributes(inode, &o->attr,
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 823a328791c0..302f45101a96 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -120,11 +120,11 @@ struct hpfs_spare_block
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
u8 bad_bitmap: 1; /* bad bitmap */
u8 fast: 1; /* partition was fast formatted */
- u8 old_wrote: 1; /* old version wrote to partion */
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
#else
- u8 old_wrote_1: 1; /* old version wrote to partion (?) */
- u8 old_wrote: 1; /* old version wrote to partion */
+ u8 old_wrote_1: 1; /* old version wrote to partition (?) */
+ u8 old_wrote: 1; /* old version wrote to partition */
u8 fast: 1; /* partition was fast formatted */
u8 bad_bitmap: 1; /* bad bitmap */
u8 bad_sector: 1; /* bad sector, corrupted disk (???) */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b0eef008de67..ec32fece5e1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -27,7 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
@@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
-struct hugetlbfs_config {
+enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+struct hugetlbfs_fs_context {
struct hstate *hstate;
+ unsigned long long max_size_opt;
+ unsigned long long min_size_opt;
long max_hpages;
long nr_inodes;
long min_hpages;
+ enum hugetlbfs_size_type max_val_type;
+ enum hugetlbfs_size_type min_val_type;
kuid_t uid;
kgid_t gid;
umode_t mode;
@@ -57,22 +63,30 @@ struct hugetlbfs_config {
int sysctl_hugetlb_shm_group;
-enum {
- Opt_size, Opt_nr_inodes,
- Opt_mode, Opt_uid, Opt_gid,
- Opt_pagesize, Opt_min_size,
- Opt_err,
+enum hugetlb_param {
+ Opt_gid,
+ Opt_min_size,
+ Opt_mode,
+ Opt_nr_inodes,
+ Opt_pagesize,
+ Opt_size,
+ Opt_uid,
};
-static const match_table_t tokens = {
- {Opt_size, "size=%s"},
- {Opt_nr_inodes, "nr_inodes=%s"},
- {Opt_mode, "mode=%o"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pagesize, "pagesize=%s"},
- {Opt_min_size, "min_size=%s"},
- {Opt_err, NULL},
+static const struct fs_parameter_spec hugetlb_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_string("min_size", Opt_min_size),
+ fsparam_u32 ("mode", Opt_mode),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("pagesize", Opt_pagesize),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+static const struct fs_parameter_description hugetlb_fs_parameters = {
+ .name = "hugetlbfs",
+ .specs = hugetlb_param_specs,
};
#ifdef CONFIG_NUMA
@@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
}
static struct inode *hugetlbfs_get_root(struct super_block *sb,
- struct hugetlbfs_config *config)
+ struct hugetlbfs_fs_context *ctx)
{
struct inode *inode;
inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mode = S_IFDIR | config->mode;
- inode->i_uid = config->uid;
- inode->i_gid = config->gid;
+ inode->i_mode = S_IFDIR | ctx->mode;
+ inode->i_uid = ctx->uid;
+ inode->i_gid = ctx->gid;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -1093,8 +1107,6 @@ static const struct super_operations hugetlbfs_ops = {
.show_options = hugetlbfs_show_options,
};
-enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
-
/*
* Convert size option passed from command line to number of huge pages
* in the pool specified by hstate. Size option could be in bytes
@@ -1117,170 +1129,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
return size_opt;
}
-static int
-hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
+/*
+ * Parse one mount parameter.
+ */
+static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p, *rest;
- substring_t args[MAX_OPT_ARGS];
- int option;
- unsigned long long max_size_opt = 0, min_size_opt = 0;
- enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE;
-
- if (!options)
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ char *rest;
+ unsigned long ps;
+ int opt;
+
+ opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_val;
return 0;
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_val;
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(pconfig->uid))
- goto bad_val;
- break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 01777U;
+ return 0;
- case Opt_gid:
- if (match_int(&args[0], &option))
- goto bad_val;
- pconfig->gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(pconfig->gid))
- goto bad_val;
- break;
+ case Opt_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->max_size_opt = memparse(param->string, &rest);
+ ctx->max_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->max_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- goto bad_val;
- pconfig->mode = option & 01777U;
- break;
+ case Opt_nr_inodes:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->nr_inodes = memparse(param->string, &rest);
+ return 0;
- case Opt_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- max_size_opt = memparse(args[0].from, &rest);
- max_val_type = SIZE_STD;
- if (*rest == '%')
- max_val_type = SIZE_PERCENT;
- break;
+ case Opt_pagesize:
+ ps = memparse(param->string, &rest);
+ ctx->hstate = size_to_hstate(ps);
+ if (!ctx->hstate) {
+ pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ return -EINVAL;
}
+ return 0;
- case Opt_nr_inodes:
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- pconfig->nr_inodes = memparse(args[0].from, &rest);
- break;
+ case Opt_min_size:
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(param->string[0]))
+ goto bad_val;
+ ctx->min_size_opt = memparse(param->string, &rest);
+ ctx->min_val_type = SIZE_STD;
+ if (*rest == '%')
+ ctx->min_val_type = SIZE_PERCENT;
+ return 0;
- case Opt_pagesize: {
- unsigned long ps;
- ps = memparse(args[0].from, &rest);
- pconfig->hstate = size_to_hstate(ps);
- if (!pconfig->hstate) {
- pr_err("Unsupported page size %lu MB\n",
- ps >> 20);
- return -EINVAL;
- }
- break;
- }
+ default:
+ return -EINVAL;
+ }
- case Opt_min_size: {
- /* memparse() will accept a K/M/G without a digit */
- if (!isdigit(*args[0].from))
- goto bad_val;
- min_size_opt = memparse(args[0].from, &rest);
- min_val_type = SIZE_STD;
- if (*rest == '%')
- min_val_type = SIZE_PERCENT;
- break;
- }
+bad_val:
+ return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n",
+ param->string, param->key);
+}
- default:
- pr_err("Bad mount option: \"%s\"\n", p);
- return -EINVAL;
- break;
- }
- }
+/*
+ * Validate the parsed options.
+ */
+static int hugetlbfs_validate(struct fs_context *fc)
+{
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
/*
* Use huge page pool size (in hstate) to convert the size
* options to number of huge pages. If NO_SIZE, -1 is returned.
*/
- pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- max_size_opt, max_val_type);
- pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
- min_size_opt, min_val_type);
+ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->max_size_opt,
+ ctx->max_val_type);
+ ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
+ ctx->min_size_opt,
+ ctx->min_val_type);
/*
* If max_size was specified, then min_size must be smaller
*/
- if (max_val_type > NO_SIZE &&
- pconfig->min_hpages > pconfig->max_hpages) {
- pr_err("minimum size can not be greater than maximum size\n");
+ if (ctx->max_val_type > NO_SIZE &&
+ ctx->min_hpages > ctx->max_hpages) {
+ pr_err("Minimum size can not be greater than maximum size\n");
return -EINVAL;
}
return 0;
-
-bad_val:
- pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
- return -EINVAL;
}
static int
-hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- int ret;
- struct hugetlbfs_config config;
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct hugetlbfs_sb_info *sbinfo;
- config.max_hpages = -1; /* No limit on size by default */
- config.nr_inodes = -1; /* No limit on number of inodes by default */
- config.uid = current_fsuid();
- config.gid = current_fsgid();
- config.mode = 0755;
- config.hstate = &default_hstate;
- config.min_hpages = -1; /* No default minimum size */
- ret = hugetlbfs_parse_options(data, &config);
- if (ret)
- return ret;
-
sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
if (!sbinfo)
return -ENOMEM;
sb->s_fs_info = sbinfo;
- sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_inodes = config.nr_inodes;
- sbinfo->free_inodes = config.nr_inodes;
- sbinfo->spool = NULL;
- sbinfo->uid = config.uid;
- sbinfo->gid = config.gid;
- sbinfo->mode = config.mode;
+ sbinfo->hstate = ctx->hstate;
+ sbinfo->max_inodes = ctx->nr_inodes;
+ sbinfo->free_inodes = ctx->nr_inodes;
+ sbinfo->spool = NULL;
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->mode = ctx->mode;
/*
* Allocate and initialize subpool if maximum or minimum size is
* specified. Any needed reservations (for minimim size) are taken
* taken when the subpool is created.
*/
- if (config.max_hpages != -1 || config.min_hpages != -1) {
- sbinfo->spool = hugepage_new_subpool(config.hstate,
- config.max_hpages,
- config.min_hpages);
+ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
+ sbinfo->spool = hugepage_new_subpool(ctx->hstate,
+ ctx->max_hpages,
+ ctx->min_hpages);
if (!sbinfo->spool)
goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = huge_page_size(config.hstate);
- sb->s_blocksize_bits = huge_page_shift(config.hstate);
+ sb->s_blocksize = huge_page_size(ctx->hstate);
+ sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
- sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
+ sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
return 0;
@@ -1290,16 +1283,52 @@ out_free:
return -ENOMEM;
}
-static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_tree(struct fs_context *fc)
+{
+ int err = hugetlbfs_validate(fc);
+ if (err)
+ return err;
+ return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super);
+}
+
+static void hugetlbfs_fs_context_free(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations hugetlbfs_fs_context_ops = {
+ .free = hugetlbfs_fs_context_free,
+ .parse_param = hugetlbfs_parse_param,
+ .get_tree = hugetlbfs_get_tree,
+};
+
+static int hugetlbfs_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+ struct hugetlbfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->max_hpages = -1; /* No limit on size by default */
+ ctx->nr_inodes = -1; /* No limit on number of inodes by default */
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+ ctx->mode = 0755;
+ ctx->hstate = &default_hstate;
+ ctx->min_hpages = -1; /* No default minimum size */
+ ctx->max_val_type = NO_SIZE;
+ ctx->min_val_type = NO_SIZE;
+ fc->fs_private = ctx;
+ fc->ops = &hugetlbfs_fs_context_ops;
+ return 0;
}
static struct file_system_type hugetlbfs_fs_type = {
- .name = "hugetlbfs",
- .mount = hugetlbfs_mount,
- .kill_sb = kill_litter_super,
+ .name = "hugetlbfs",
+ .init_fs_context = hugetlbfs_init_fs_context,
+ .parameters = &hugetlb_fs_parameters,
+ .kill_sb = kill_litter_super,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1384,8 +1413,29 @@ out:
return file;
}
+static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+
+ fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc)) {
+ mnt = ERR_CAST(fc);
+ } else {
+ struct hugetlbfs_fs_context *ctx = fc->fs_private;
+ ctx->hstate = h;
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
+ }
+ if (IS_ERR(mnt))
+ pr_err("Cannot mount internal hugetlbfs for page size %uK",
+ 1U << (h->order + PAGE_SHIFT - 10));
+ return mnt;
+}
+
static int __init init_hugetlbfs_fs(void)
{
+ struct vfsmount *mnt;
struct hstate *h;
int error;
int i;
@@ -1408,24 +1458,16 @@ static int __init init_hugetlbfs_fs(void)
i = 0;
for_each_hstate(h) {
- char buf[50];
- unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
-
- snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
- hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
- buf);
-
- if (IS_ERR(hugetlbfs_vfsmount[i])) {
- pr_err("Cannot mount internal hugetlbfs for "
- "page size %uK", ps_kb);
- error = PTR_ERR(hugetlbfs_vfsmount[i]);
- hugetlbfs_vfsmount[i] = NULL;
+ mnt = mount_one_hugetlbfs(h);
+ if (IS_ERR(mnt) && i == 0) {
+ error = PTR_ERR(mnt);
+ goto out;
}
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
- /* Non default hstates are optional */
- if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
- return 0;
+
+ return 0;
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
diff --git a/fs/internal.h b/fs/internal.h
index d410186bc369..6a8b71643af4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
+struct fs_context;
/*
* block_dev.c
@@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
extern void __init chrdev_init(void);
/*
+ * fs_context.c
+ */
+extern int parse_monolithic_mount_data(struct fs_context *, void *);
+extern void fc_drop_locked(struct fs_context *);
+
+/*
* namei.c
*/
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
/*
* super.c
*/
-extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
-extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c592a0933b0d..6aaa30580a2b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -917,7 +917,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
/* Use 8x RA size as a decent limiter for both reads/writes */
max_pages = filp->f_ra.ra_pages;
if (!max_pages)
- max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
+ max_pages = VM_READAHEAD_PAGES;
max_pages *= 8;
/* If max pages are exceeded, reset the state */
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 26f8d7e46462..02e0b79753e7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
@@ -276,9 +276,22 @@ restart:
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
jbd2_log_start_commit(journal, tid);
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set, jbd2_update_log_tail() called by
+ * jbd2_journal_commit_transaction() may also take
+ * checkpoint_mutex. So we need to temporarily
+ * drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
- goto retry;
+ mutex_lock_io(&journal->j_checkpoint_mutex);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
}
if (!buffer_dirty(bh)) {
if (unlikely(buffer_write_io_error(bh)) && !result)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2eb55c3361a8..efd0ce9489ae 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
the last tag we set up. */
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
- jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
+ if (descriptor)
+ jbd2_descriptor_block_csum_set(journal,
+ descriptor);
+
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..382c030cc78b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
return cpu_to_be32(csum);
}
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
/*
* Helper function used to manage commit timeouts
*/
@@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
@@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
- lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
- jbd2_superblock_csum_set(journal, sb);
+ if (jbd2_journal_has_csum_v2or3(journal))
+ sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
+ lock_buffer(journal->j_sb_buffer);
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
@@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
journal_superblock_t *sb = journal->j_superblock;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- read_lock(&journal->j_state_lock);
- /* Is it already empty? */
- if (sb->s_start == 0) {
- read_unlock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
+ if (sb->s_start == 0) { /* Is it already empty? */
+ unlock_buffer(journal->j_sb_buffer);
return;
}
+
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
- read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, write_op);
@@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
journal_superblock_t *sb = journal->j_superblock;
int errcode;
- read_lock(&journal->j_state_lock);
+ lock_buffer(journal->j_sb_buffer);
errcode = journal->j_errno;
- read_unlock(&journal->j_state_lock);
if (errcode == -ESHUTDOWN)
errcode = 0;
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal)
}
}
- /* Check superblock checksum */
- if (!jbd2_superblock_csum_verify(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
- }
+ if (jbd2_journal_has_csum_v2or3(journal)) {
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ goto out;
+ }
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
+ /* Precompute checksum seed for all metadata */
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
+ }
set_buffer_verified(bh);
@@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
+ /* Load the checksum driver if necessary */
+ if ((journal->j_chksum_driver == NULL) &&
+ INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ journal->j_chksum_driver = NULL;
+ return 0;
+ }
+ /* Precompute checksum seed for all metadata */
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ }
+
+ lock_buffer(journal->j_sb_buffer);
+
/* If enabling v3 checksums, update superblock */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
- /* Load the checksum driver */
- if (journal->j_chksum_driver == NULL) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c",
- 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c "
- "driver.\n");
- journal->j_chksum_driver = NULL;
- return 0;
- }
-
- /* Precompute checksum seed for all metadata */
- journal->j_csum_seed = jbd2_chksum(journal, ~0,
- sb->s_uuid,
- sizeof(sb->s_uuid));
- }
}
/* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
+ unlock_buffer(journal->j_sb_buffer);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
err = jbd2_journal_skip_recovery(journal);
if (write) {
/* Lock to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
+ mutex_lock_io(&journal->j_checkpoint_mutex);
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cc35537232f2..f940d31c2adc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
- * Simply allocate and initialise a new transaction. Create it in
+ * Simply initialise a new transaction. Initialize it in
* RUNNING state and add it to the current journal (which should not
* have an existing running transaction: we only make a new transaction
* once we have started to commit the old one).
@@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
*
*/
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+ transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
@@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
transaction->t_requested = 0;
-
- return transaction;
}
/*
@@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
- JBUFFER_TRACE(jh, "entry");
if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
if (is_handle_aborted(handle))
return -EROFS;
- if (!buffer_jbd(bh)) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (!buffer_jbd(bh))
+ return -EUCLEAN;
+
/*
* We don't grab jh reference here since the buffer must be part
* of the running transaction.
*/
jh = bh2jh(bh);
+ jbd_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
+
/*
* This and the following assertions are unreliable since we may see jh
* in inconsistent state unless we grab bh_state lock. But this is
@@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
}
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
-
jbd_lock_bh_state(bh);
if (jh->b_modified == 0) {
@@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_unfile_buffer(jh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
+ goto not_jbd;
}
}
spin_unlock(&journal->j_list_lock);
@@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
/* However, if the buffer is still owned by a prior
* (committing) transaction, we can't drop it yet... */
JBUFFER_TRACE(jh, "belongs to older transaction");
- /* ... but we CAN drop it from the new transaction if we
- * have also modified it since the original commit. */
+ /* ... but we CAN drop it from the new transaction through
+ * marking the buffer as freed and set j_next_transaction to
+ * the new transaction, so that not only the commit code
+ * knows it should clear dirty bits when it is done with the
+ * buffer, but also the buffer can be checkpointed only
+ * after the new transaction commits. */
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction == transaction);
+ set_buffer_freed(bh);
+
+ if (!jh->b_next_transaction) {
spin_lock(&journal->j_list_lock);
- jh->b_next_transaction = NULL;
+ jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
+ } else {
+ J_ASSERT(jh->b_next_transaction == transaction);
/*
* only drop a reference if this transaction modified
@@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (was_modified)
drop_reserve = 1;
}
+ } else {
+ /*
+ * Finally, if the buffer is not belongs to any
+ * transaction, we can just drop it now if it has no
+ * checkpoint.
+ */
+ spin_lock(&journal->j_list_lock);
+ if (!jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "belongs to none transaction");
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * Otherwise, if the buffer has been written to disk,
+ * it is safe to remove the checkpoint and drop it.
+ */
+ if (!buffer_dirty(bh)) {
+ __jbd2_journal_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ goto not_jbd;
+ }
+
+ /*
+ * The buffer is still not written to disk, we should
+ * attach this buffer to current transaction so that the
+ * buffer can be checkpointed only after the current
+ * transaction commits.
+ */
+ clear_buffer_dirty(bh);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ spin_unlock(&journal->j_list_lock);
}
-not_jbd:
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
@@ -1636,6 +1670,11 @@ drop:
handle->h_buffer_credits++;
}
return err;
+
+not_jbd:
+ jbd_unlock_bh_state(bh);
+ __bforget(bh);
+ goto drop;
}
/**
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dba810cd83b1..0b7d197a904c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -17,6 +17,7 @@
#include <linux/xattr.h>
#include <linux/kernfs.h>
+#include <linux/fs_context.h>
struct kernfs_iattrs {
struct iattr ia_iattr;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f3ac352699cf..9a4646eecb71 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -22,16 +22,6 @@
struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- struct kernfs_root *root = kernfs_info(sb)->root;
- struct kernfs_syscall_ops *scops = root->syscall_ops;
-
- if (scops && scops->remount_fs)
- return scops->remount_fs(root, flags, data);
- return 0;
-}
-
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
@@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = {
.drop_inode = generic_delete_inode,
.evict_inode = kernfs_evict_inode,
- .remount_fs = kernfs_sop_remount_fs,
.show_options = kernfs_sop_show_options,
.show_path = kernfs_sop_show_path,
};
@@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
} while (true);
}
-static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
struct kernfs_super_info *info = kernfs_info(sb);
struct inode *inode;
@@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- sb->s_magic = magic;
+ sb->s_magic = kfc->magic;
sb->s_op = &kernfs_sops;
sb->s_xattr = kernfs_xattr_handlers;
if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
@@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
return 0;
}
-static int kernfs_test_super(struct super_block *sb, void *data)
+static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
{
struct kernfs_super_info *sb_info = kernfs_info(sb);
- struct kernfs_super_info *info = data;
+ struct kernfs_super_info *info = fc->s_fs_info;
return sb_info->root == info->root && sb_info->ns == info->ns;
}
-static int kernfs_set_super(struct super_block *sb, void *data)
+static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
{
- int error;
- error = set_anon_super(sb, data);
- if (!error)
- sb->s_fs_info = data;
- return error;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ kfc->ns_tag = NULL;
+ return set_anon_super_fc(sb, fc);
}
/**
@@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb)
}
/**
- * kernfs_mount_ns - kernfs mount helper
- * @fs_type: file_system_type of the fs being mounted
- * @flags: mount flags specified for the mount
- * @root: kernfs_root of the hierarchy being mounted
- * @magic: file system specific magic number
- * @new_sb_created: tell the caller if we allocated a new superblock
- * @ns: optional namespace tag of the mount
+ * kernfs_get_tree - kernfs filesystem access/retrieval helper
+ * @fc: The filesystem context.
*
- * This is to be called from each kernfs user's file_system_type->mount()
- * implementation, which should pass through the specified @fs_type and
- * @flags, and specify the hierarchy and namespace tag to mount via @root
- * and @ns, respectively.
- *
- * The return value can be passed to the vfs layer verbatim.
+ * This is to be called from each kernfs user's fs_context->ops->get_tree()
+ * implementation, which should set the specified ->@fs_type and ->@flags, and
+ * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
+ * respectively.
*/
-struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
- struct kernfs_root *root, unsigned long magic,
- bool *new_sb_created, const void *ns)
+int kernfs_get_tree(struct fs_context *fc)
{
+ struct kernfs_fs_context *kfc = fc->fs_private;
struct super_block *sb;
struct kernfs_super_info *info;
int error;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- info->root = root;
- info->ns = ns;
+ info->root = kfc->root;
+ info->ns = kfc->ns_tag;
INIT_LIST_HEAD(&info->node);
- sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
- &init_user_ns, info);
- if (IS_ERR(sb) || sb->s_fs_info != info)
- kfree(info);
+ fc->s_fs_info = info;
+ sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- if (new_sb_created)
- *new_sb_created = !sb->s_root;
+ return PTR_ERR(sb);
if (!sb->s_root) {
struct kernfs_super_info *info = kernfs_info(sb);
- error = kernfs_fill_super(sb, magic);
+ kfc->new_sb_created = true;
+
+ error = kernfs_fill_super(sb, kfc);
if (error) {
deactivate_locked_super(sb);
- return ERR_PTR(error);
+ return error;
}
sb->s_flags |= SB_ACTIVE;
mutex_lock(&kernfs_mutex);
- list_add(&info->node, &root->supers);
+ list_add(&info->node, &info->root->supers);
mutex_unlock(&kernfs_mutex);
}
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+
+void kernfs_free_fs_context(struct fs_context *fc)
+{
+ /* Note that we don't deal with kfc->ns_tag here. */
+ kfree(fc->s_fs_info);
+ fc->s_fs_info = NULL;
}
/**
@@ -377,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
-/**
- * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
- * @kernfs_root: the kernfs_root in question
- * @ns: the namespace tag
- *
- * Pin the superblock so the superblock won't be destroyed in subsequent
- * operations. This can be used to block ->kill_sb() which may be useful
- * for kernfs users which dynamically manage superblocks.
- *
- * Returns NULL if there's no superblock associated to this kernfs_root, or
- * -EINVAL if the superblock is being freed.
- */
-struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
-{
- struct kernfs_super_info *info;
- struct super_block *sb = NULL;
-
- mutex_lock(&kernfs_mutex);
- list_for_each_entry(info, &root->supers, node) {
- if (info->ns == ns) {
- sb = info->sb;
- if (!atomic_inc_not_zero(&info->sb->s_active))
- sb = ERR_PTR(-EINVAL);
- break;
- }
- }
- mutex_unlock(&kernfs_mutex);
- return sb;
-}
-
void __init kernfs_init(void)
{
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 214a2fa1f1e3..7df6324ccb8a 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv4 basic data types
*
* Basic NLMv4 data types are defined in Appendix II, section 6.1.4
@@ -176,7 +165,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -236,7 +224,6 @@ out_bad_xdr:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 747b9c8c940a..4df62f635529 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
}
/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("lockd: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NLMv3 basic data types
*
* Basic NLMv3 data types are not defined in an IETF standards
@@ -173,7 +162,6 @@ out_size:
dprintk("NFS: returned cookie was too long: %u\n", length);
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -231,7 +219,6 @@ out_enum:
__func__, be32_to_cpup(p));
return -EIO;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
out:
return error;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
diff --git a/fs/mount.h b/fs/mount.h
index f39bc9da4d73..6250de544760 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry)
return __is_local_mountpoint(dentry);
}
+
+static inline bool is_anon_ns(struct mnt_namespace *ns)
+{
+ return ns->seq == 0;
+}
diff --git a/fs/namei.c b/fs/namei.c
index 0a8c5c27f90e..dede0147b3f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2331,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
return err;
}
-static int filename_lookup(int dfd, struct filename *name, unsigned flags,
- struct path *path, struct path *root)
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
{
int retval;
struct nameidata nd;
@@ -3460,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
+ ima_post_create_tmpfile(inode);
return child;
out_err:
diff --git a/fs/namespace.c b/fs/namespace.c
index 98a8c182af4f..c9cab307fa77 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
+#include <linux/fs_context.h>
#include "pnode.h"
#include "internal.h"
@@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p)
return p;
}
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_create_mount - Create a mount for a configured superblock
+ * @fc: The configuration context with the superblock attached
+ *
+ * Create a mount to an already configured superblock. If necessary, the
+ * caller should invoke vfs_get_tree() before calling this.
+ *
+ * Note that this does not attach the mount to anything.
+ */
+struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct dentry *root;
- if (!type)
- return ERR_PTR(-ENODEV);
+ if (!fc->root)
+ return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(name);
+ mnt = alloc_vfsmnt(fc->source ?: "none");
if (!mnt)
return ERR_PTR(-ENOMEM);
- if (flags & SB_KERNMOUNT)
+ if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
- if (IS_ERR(root)) {
- mnt_free_id(mnt);
- free_vfsmnt(mnt);
- return ERR_CAST(root);
- }
+ atomic_inc(&fc->root->d_sb->s_active);
+ mnt->mnt.mnt_sb = fc->root->d_sb;
+ mnt->mnt.mnt_root = dget(fc->root);
+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+ mnt->mnt_parent = mnt;
- mnt->mnt.mnt_root = root;
- mnt->mnt.mnt_sb = root->d_sb;
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
lock_mount_hash();
- list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
+ list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
return &mnt->mnt;
}
+EXPORT_SYMBOL(vfs_create_mount);
+
+struct vfsmount *fc_mount(struct fs_context *fc)
+{
+ int err = vfs_get_tree(fc);
+ if (!err) {
+ up_write(&fc->root->d_sb->s_umount);
+ return vfs_create_mount(fc);
+ }
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fc_mount);
+
+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
+ int flags, const char *name,
+ void *data)
+{
+ struct fs_context *fc;
+ struct vfsmount *mnt;
+ int ret = 0;
+
+ if (!type)
+ return ERR_PTR(-EINVAL);
+
+ fc = fs_context_for_mount(type, flags);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ if (name)
+ ret = vfs_parse_fs_string(fc, "source",
+ name, strlen(name));
+ if (!ret)
+ ret = parse_monolithic_mount_data(fc, data);
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+
+ put_fs_context(fc);
+ return mnt;
+}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
struct vfsmount *
@@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
- /* Don't allow unprivileged users to change mount flags */
- if (flag & CL_UNPRIVILEGED) {
- mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
-
- if (mnt->mnt.mnt_flags & MNT_READONLY)
- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
-
- if (mnt->mnt.mnt_flags & MNT_NODEV)
- mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
-
- if (mnt->mnt.mnt_flags & MNT_NOSUID)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
-
- if (mnt->mnt.mnt_flags & MNT_NOEXEC)
- mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
- }
-
- /* Don't allow unprivileged users to reveal what is under a mount */
- if ((flag & CL_UNPRIVILEGED) &&
- (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
- mnt->mnt.mnt_flags |= MNT_LOCKED;
atomic_inc(&sb->s_active);
mnt->mnt.mnt_sb = sb;
@@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
static void shrink_submounts(struct mount *mnt);
+static int do_umount_root(struct super_block *sb)
+{
+ int ret = 0;
+
+ down_write(&sb->s_umount);
+ if (!sb_rdonly(sb)) {
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
+ SB_RDONLY);
+ if (IS_ERR(fc)) {
+ ret = PTR_ERR(fc);
+ } else {
+ ret = parse_monolithic_mount_data(fc, NULL);
+ if (!ret)
+ ret = reconfigure_super(fc);
+ put_fs_context(fc);
+ }
+ }
+ up_write(&sb->s_umount);
+ return ret;
+}
+
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
@@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags)
*/
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
- down_write(&sb->s_umount);
- if (!sb_rdonly(sb))
- retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
- up_write(&sb->s_umount);
- return retval;
+ return do_umount_root(sb);
}
namespace_lock();
@@ -1839,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
return 0;
}
+static void lock_mnt_tree(struct mount *mnt)
+{
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ int flags = p->mnt.mnt_flags;
+ /* Don't allow unprivileged users to change mount flags */
+ flags |= MNT_LOCK_ATIME;
+
+ if (flags & MNT_READONLY)
+ flags |= MNT_LOCK_READONLY;
+
+ if (flags & MNT_NODEV)
+ flags |= MNT_LOCK_NODEV;
+
+ if (flags & MNT_NOSUID)
+ flags |= MNT_LOCK_NOSUID;
+
+ if (flags & MNT_NOEXEC)
+ flags |= MNT_LOCK_NOEXEC;
+ /* Don't allow unprivileged users to reveal what is under a mount */
+ if (list_empty(&p->mnt_expire))
+ flags |= MNT_LOCKED;
+ p->mnt.mnt_flags = flags;
+ }
+}
+
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
struct mount *p;
@@ -1956,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
struct mountpoint *dest_mp,
struct path *parent_path)
{
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mountpoint *smp;
@@ -2006,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt_mountpoint);
if (q)
mnt_change_mountpoint(child, smp, q);
+ /* Notice when we are propagating across user namespaces */
+ if (child->mnt_parent->mnt_ns->user_ns != user_ns)
+ lock_mnt_tree(child);
commit_tree(child);
}
put_mountpoint(smp);
@@ -2313,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
int err;
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
- void *sec_opts = NULL;
+ struct fs_context *fc;
if (!check_mnt(mnt))
return -EINVAL;
@@ -2324,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
- if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) {
- err = security_sb_eat_lsm_opts(data, &sec_opts);
- if (err)
- return err;
- }
- err = security_sb_remount(sb, sec_opts);
- security_free_mnt_opts(&sec_opts);
- if (err)
- return err;
+ fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
- down_write(&sb->s_umount);
- err = -EPERM;
- if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- err = do_remount_sb(sb, sb_flags, data, 0);
- if (!err)
- set_mount_attributes(mnt, mnt_flags);
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err) {
+ down_write(&sb->s_umount);
+ err = -EPERM;
+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ err = reconfigure_super(fc);
+ if (!err)
+ set_mount_attributes(mnt, mnt_flags);
+ }
+ up_write(&sb->s_umount);
}
- up_write(&sb->s_umount);
+ put_fs_context(fc);
return err;
}
@@ -2425,29 +2496,6 @@ out:
return err;
}
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
- int err;
- const char *subtype = strchr(fstype, '.');
- if (subtype) {
- subtype++;
- err = -EINVAL;
- if (!subtype[0])
- goto err;
- } else
- subtype = "";
-
- mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
- err = -ENOMEM;
- if (!mnt->mnt_sb->s_subtype)
- goto err;
- return mnt;
-
- err:
- mntput(mnt);
- return ERR_PTR(err);
-}
-
/*
* add a mount into a namespace's mount tree
*/
@@ -2492,7 +2540,39 @@ unlock:
return err;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
+
+/*
+ * Create a new mount using a superblock configuration and request it
+ * be added to the namespace tree.
+ */
+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
+ unsigned int mnt_flags)
+{
+ struct vfsmount *mnt;
+ struct super_block *sb = fc->root->d_sb;
+ int error;
+
+ error = security_sb_kern_mount(sb);
+ if (!error && mount_too_revealing(sb, &mnt_flags))
+ error = -EPERM;
+
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
+ }
+
+ up_write(&sb->s_umount);
+
+ mnt = vfs_create_mount(fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ if (error < 0)
+ mntput(mnt);
+ return error;
+}
/*
* create a new mount for userspace and request it to be added into the
@@ -2502,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct vfsmount *mnt;
- int err;
+ struct fs_context *fc;
+ const char *subtype = NULL;
+ int err = 0;
if (!fstype)
return -EINVAL;
@@ -2512,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (!type)
return -ENODEV;
- mnt = vfs_kern_mount(type, sb_flags, name, data);
- if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
- !mnt->mnt_sb->s_subtype)
- mnt = fs_set_subtype(mnt, fstype);
+ if (type->fs_flags & FS_HAS_SUBTYPE) {
+ subtype = strchr(fstype, '.');
+ if (subtype) {
+ subtype++;
+ if (!*subtype) {
+ put_filesystem(type);
+ return -EINVAL;
+ }
+ } else {
+ subtype = "";
+ }
+ }
+ fc = fs_context_for_mount(type, sb_flags);
put_filesystem(type);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- if (mount_too_revealing(mnt, &mnt_flags)) {
- mntput(mnt);
- return -EPERM;
- }
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (subtype)
+ err = vfs_parse_fs_string(fc, "subtype",
+ subtype, strlen(subtype));
+ if (!err && name)
+ err = vfs_parse_fs_string(fc, "source", name, strlen(name));
+ if (!err)
+ err = parse_monolithic_mount_data(fc, data);
+ if (!err)
+ err = vfs_get_tree(fc);
+ if (!err)
+ err = do_new_mount_fc(fc, path, mnt_flags);
- err = do_add_mount(real_mount(mnt), path, mnt_flags);
- if (err)
- mntput(mnt);
+ put_fs_context(fc);
return err;
}
@@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
- ns_free_inum(&ns->ns);
+ if (!is_anon_ns(ns))
+ ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
*/
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
struct ucounts *ucounts;
@@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
+ if (!anon) {
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
+ }
}
new_ns->ns.ops = &mntns_operations;
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ if (!anon)
+ new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
atomic_set(&new_ns->count, 1);
- new_ns->root = NULL;
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
- new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
- new_ns->mounts = 0;
- new_ns->pending_mounts = 0;
return new_ns;
}
@@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
old = ns->root;
- new_ns = alloc_mnt_ns(user_ns);
+ new_ns = alloc_mnt_ns(user_ns, false);
if (IS_ERR(new_ns))
return new_ns;
@@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
- copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+ copy_flags |= CL_SHARED_TO_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
+ if (user_ns != ns->user_ns) {
+ lock_mount_hash();
+ lock_mnt_tree(new);
+ unlock_mount_hash();
+ }
new_ns->root = new;
list_add_tail(&new_ns->list, &new->mnt_list);
@@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
return new_ns;
}
-/**
- * create_mnt_ns - creates a private namespace and adds a root filesystem
- * @mnt: pointer to the new root filesystem mountpoint
- */
-static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
-{
- struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
- if (!IS_ERR(new_ns)) {
- struct mount *mnt = real_mount(m);
- mnt->mnt_ns = new_ns;
- new_ns->root = mnt;
- new_ns->mounts++;
- list_add(&mnt->mnt_list, &new_ns->list);
- } else {
- mntput(m);
- }
- return new_ns;
-}
-
-struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
+ struct mount *mnt = real_mount(m);
struct mnt_namespace *ns;
struct super_block *s;
struct path path;
int err;
- ns = create_mnt_ns(mnt);
- if (IS_ERR(ns))
+ ns = alloc_mnt_ns(&init_user_ns, true);
+ if (IS_ERR(ns)) {
+ mntput(m);
return ERR_CAST(ns);
+ }
+ mnt->mnt_ns = ns;
+ ns->root = mnt;
+ ns->mounts++;
+ list_add(&mnt->mnt_list, &ns->list);
- err = vfs_path_lookup(mnt->mnt_root, mnt,
+ err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
put_mnt_ns(ns);
@@ -3228,6 +3316,7 @@ out0:
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
+ struct mount *m;
struct mnt_namespace *ns;
struct path root;
struct file_system_type *type;
@@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void)
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = create_mnt_ns(mnt);
+ ns = alloc_mnt_ns(&init_user_ns, false);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
-
+ m = real_mount(mnt);
+ m->mnt_ns = ns;
+ ns->root = m;
+ ns->mounts = 1;
+ list_add(&m->mnt_list, &ns->list);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
@@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns)
free_mnt_ns(ns);
}
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+struct vfsmount *kern_mount(struct file_system_type *type)
{
struct vfsmount *mnt;
- mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (!IS_ERR(mnt)) {
/*
* it is a longterm mount, don't release mnt until
@@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
}
return mnt;
}
-EXPORT_SYMBOL_GPL(kern_mount_data);
+EXPORT_SYMBOL_GPL(kern_mount);
void kern_unmount(struct vfsmount *mnt)
{
@@ -3352,7 +3445,8 @@ bool current_chrooted(void)
return chrooted;
}
-static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+static bool mnt_already_visible(struct mnt_namespace *ns,
+ const struct super_block *sb,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
@@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
+ if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
@@ -3415,7 +3509,7 @@ found:
return visible;
}
-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
@@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return false;
/* Can this filesystem be too revealing? */
- s_iflags = mnt->mnt_sb->s_iflags;
+ s_iflags = sb->s_iflags;
if (!(s_iflags & SB_I_USERNS_VISIBLE))
return false;
@@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
return true;
}
- return !mnt_already_visible(ns, mnt, new_mnt_flags);
+ return !mnt_already_visible(ns, sb, new_mnt_flags);
}
bool mnt_may_suid(struct vfsmount *mnt)
@@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
+ if (is_anon_ns(mnt_ns))
+ return -EINVAL;
+
if (fs->users != 1)
return -EINVAL;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a87a56273407..06233bfa6d73 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
return xdr_ressize_check(rqstp, p);
}
-static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
-{
- __be32 *p;
-
- p = xdr_inline_decode(xdr, nbytes);
- if (unlikely(p == NULL))
- printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
- return p;
-}
-
static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
const char **str, size_t maxlen)
{
@@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
fh->size = ntohl(*p);
if (fh->size > NFS4_FHSIZE)
return htonl(NFS4ERR_BADHANDLE);
- p = read_buf(xdr, fh->size);
+ p = xdr_inline_decode(xdr, fh->size);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(&fh->data[0], p, fh->size);
@@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
__be32 *p;
unsigned int attrlen;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
attrlen = ntohl(*p);
- p = read_buf(xdr, attrlen << 2);
+ p = xdr_inline_decode(xdr, attrlen << 2);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
if (likely(attrlen > 0))
@@ -135,7 +125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
__be32 *p;
- p = read_buf(xdr, NFS4_STATEID_SIZE);
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
memcpy(stateid->data, p, NFS4_STATEID_SIZE);
@@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
hdr->minorversion = ntohl(*p++);
@@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
{
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE_HDR);
*op = ntohl(*p);
@@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp,
status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
args->truncate = ntohl(*p);
@@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- p = read_buf(xdr, 4 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
if (unlikely(status != 0))
return status;
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
return decode_layout_stateid(xdr, &args->cbl_stateid);
} else if (args->cbl_recall_type == RETURN_FSID) {
- p = read_buf(xdr, 2 * sizeof(uint64_t));
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
@@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
args->ndevs = 0;
/* Num of device notifications */
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto out;
@@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
for (i = 0; i < n; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+ p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+ NFS4_DEVICEID4_SIZE);
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
goto err;
@@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
{
__be32 *p;
- p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
+ p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
goto out;
status = htonl(NFS4ERR_RESOURCE);
- p = read_buf(xdr, sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
rc_list->rcl_nrefcalls = ntohl(*p++);
if (rc_list->rcl_nrefcalls) {
- p = read_buf(xdr,
+ p = xdr_inline_decode(xdr,
rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
@@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
if (status)
return status;
- p = read_buf(xdr, 5 * sizeof(uint32_t));
+ p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->craa_objs_to_keep = ntohl(*p++);
@@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
struct cb_recallslotargs *args = argp;
__be32 *p;
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
args->crsa_target_highest_slotid = ntohl(*p++);
@@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg
__be32 *p;
unsigned int len;
- p = read_buf(xdr, 12);
+ p = xdr_inline_decode(xdr, 12);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
len = be32_to_cpu(*p);
- p = read_buf(xdr, len);
+ p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr,
__be32 *p;
/* skip the always zero field */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
p++;
@@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp,
return status;
/* decode status */
- p = read_buf(xdr, 4);
+ p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out;
args->error = ntohl(*p++);
@@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
};
unsigned int nops = 0;
- xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+ xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+ rqstp->rq_arg.head[0].iov_base, NULL);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
- xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+ xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
if (status == htonl(NFS4ERR_RESOURCE))
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 885363ca8569..2f6b447cdd82 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
spin_lock(&delegation->lock);
if (delegation->inode != NULL)
inode = igrab(delegation->inode);
+ if (!inode)
+ set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
spin_unlock(&delegation->lock);
return inode;
}
@@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
/**
* nfs_super_return_all_delegations - return delegations for one superblock
- * @sb: sb to process
+ * @server: pointer to nfs_server to process
*
*/
void nfs_server_return_all_delegations(struct nfs_server *server)
@@ -944,10 +946,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
@@ -1053,10 +1056,11 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
- if (test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags))
- continue;
- if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags) == 0)
continue;
if (!nfs_sb_active(server->super))
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index dcbf3394ba0e..35b4b02c1ae0 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,6 +34,7 @@ enum {
NFS_DELEGATION_RETURNING,
NFS_DELEGATION_REVOKED,
NFS_DELEGATION_TEST_EXPIRED,
+ NFS_DELEGATION_INODE_FREEING,
};
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6bf4471850c8..a71d0b42d160 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -139,12 +139,19 @@ struct nfs_cache_array {
struct nfs_cache_array_entry array[0];
};
+struct readdirvec {
+ unsigned long nr;
+ unsigned long index;
+ struct page *pages[NFS_MAX_READDIR_RAPAGES];
+};
+
typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
struct dir_context *ctx;
unsigned long page_index;
+ struct readdirvec pvec;
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
@@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct nfs_cache_array *array;
unsigned int count = 0;
int status;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+
+ desc->pvec.index = desc->page_index;
+ desc->pvec.nr = 0;
scratch = alloc_page(GFP_KERNEL);
if (scratch == NULL)
@@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (desc->plus)
nfs_prime_dcache(file_dentry(desc->file), entry);
- status = nfs_readdir_add_to_array(entry, page);
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ if (status == -ENOSPC) {
+ desc->pvec.nr++;
+ if (desc->pvec.nr == max_rapages)
+ break;
+ status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
+ }
if (status != 0)
break;
} while (!entry->eof);
+ /*
+ * page and desc->pvec.pages[0] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[0]);
+
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = kmap(page);
+ array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]);
array->eof_index = array->size;
status = 0;
- kunmap(page);
+ kunmap_atomic(array);
}
put_page(scratch);
+
+ /*
+ * desc->pvec.nr > 0 means at least one page was completely filled,
+ * we should return -ENOSPC. Otherwise function
+ * nfs_readdir_xdr_to_array will enter infinite loop.
+ */
+ if (desc->pvec.nr > 0)
+ return -ENOSPC;
return status;
}
@@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
}
/*
- * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_pagearray
+ * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
+ * to nfs_readdir_free_pages()
*/
static
int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
@@ -595,6 +626,24 @@ out_freepages:
return -ENOMEM;
}
+/*
+ * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure.
+ */
+static
+void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
+ int index;
+
+ for (index = 0; index < max_rapages; index++) {
+ array = kmap_atomic(desc->pvec.pages[index]);
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+ kunmap_atomic(array);
+ }
+}
+
static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
@@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
+ /*
+ * This means we hit readdir rdpages miss, the preallocated rdpages
+ * are useless, the preallocate rdpages should be reinitialized.
+ */
+ nfs_readdir_rapages_init(desc);
+
entry.prev_cookie = 0;
entry.cookie = desc->last_cookie;
entry.eof = 0;
@@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
struct inode *inode = file_inode(desc->file);
int ret;
- ret = nfs_readdir_xdr_to_array(desc, page, inode);
- if (ret < 0)
- goto error;
+ /*
+ * If desc->page_index in range desc->pvec.index and
+ * desc->pvec.index + desc->pvec.nr, we get readdir cache hit.
+ */
+ if (desc->page_index >= desc->pvec.index &&
+ desc->page_index < (desc->pvec.index + desc->pvec.nr)) {
+ /*
+ * page and desc->pvec.pages[x] are valid, don't need to check
+ * whether or not to be NULL.
+ */
+ copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]);
+ ret = 0;
+ } else {
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
+ goto error;
+ }
+
SetPageUptodate(page);
if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
@@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
int res = 0;
+ int max_rapages = NFS_MAX_READDIR_RAPAGES;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
file, (long long)ctx->pos);
@@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx);
+ res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
+ if (res < 0)
+ return -ENOMEM;
+
+ nfs_readdir_rapages_init(desc);
+
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
} while (!desc->eof);
out:
+ nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
if (res > 0)
res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
@@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
/**
* nfs_force_lookup_revalidate - Mark the directory as having changed
- * @dir - pointer to directory inode
+ * @dir: pointer to directory inode
*
* This forces the revalidation code in nfs_lookup_revalidate() to do a
* full lookup on all child dentries of 'dir' whenever a change occurs
@@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode);;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
full_reval:
return nfs_do_lookup_revalidate(dir, dentry, flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 33824a0a57bf..0fd811ac08b5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -428,7 +428,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_read_sync_pgio_error(struct list_head *head)
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
@@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
spin_lock(&cinfo.inode->i_lock);
dreq->flags = 0;
if (desc.pg_error < 0)
@@ -821,7 +820,7 @@ out_put:
hdr->release(hdr);
}
-static void nfs_write_sync_pgio_error(struct list_head *head)
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 29553fdba8af..4899b85f9b3c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release);
/**
* nfs_revalidate_size - Revalidate the file size
- * @inode - pointer to inode struct
- * @file - pointer to struct file
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
*
* Revalidates the file length. This is basically a wrapper around
* nfs_revalidate_inode() that takes into account the fact that we may
@@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* then a modify/write/read cycle when writing to a page in the
* page cache.
*
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
* The modify/write/read cycle may occur if a page is read before
* being completely filled by the writer. In this situation, the
* page must be completely written to stable storage on the server
@@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* and that the new data won't completely replace the old data in
* that range of the file.
*/
-static int nfs_want_read_modify_write(struct file *file, struct page *page,
- loff_t pos, unsigned len)
+static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
{
unsigned int pglen = nfs_page_length(page);
unsigned int offset = pos & (PAGE_SIZE - 1);
unsigned int end = offset + len;
- if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
- if (!PageUptodate(page))
- return 1;
- return 0;
- }
+ return !pglen || (end >= pglen && !offset);
+}
- if ((file->f_mode & FMODE_READ) && /* open for read? */
- !PageUptodate(page) && /* Uptodate? */
- !PagePrivate(page) && /* i/o request already? */
- pglen && /* valid bytes of file? */
- (end < pglen || offset)) /* replace all valid bytes? */
- return 1;
- return 0;
+static bool nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned int len)
+{
+ /*
+ * Up-to-date pages, those with ongoing or full-page write
+ * don't need read/modify/write
+ */
+ if (PageUptodate(page) || PagePrivate(page) ||
+ nfs_full_page_write(page, pos, len))
+ return false;
+
+ if (pnfs_ld_read_whole_page(file->f_mapping->host))
+ return true;
+ /* Open for reading too? */
+ if (file->f_mode & FMODE_READ)
+ return true;
+ return false;
}
/*
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 63abe705f4ca..f9264e1922a2 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
struct cred *kcred;
- const struct cred *cred;
+ const struct cred __rcu *cred;
kuid_t uid;
kgid_t gid;
u32 ds_count, fh_count, id;
@@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
kcred->fsuid = uid;
kcred->fsgid = gid;
- cred = kcred;
+ cred = RCU_INITIALIZER(kcred);
if (lgr->range.iomode == IOMODE_READ)
rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
@@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
}
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_unavailable(devid);
+}
+
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_available(devid);
+}
+
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx,
- int *best_idx)
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx,
+ bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
int idx;
- /* mirrors are sorted by efficiency */
+ /* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
if (idx+1 == fls->mirror_array_cnt)
- fail_return = true;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
- if (ds) {
- *best_idx = idx;
- return ds;
- }
+ fail_return = !check_device;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+ if (!ds)
+ continue;
+
+ if (check_device &&
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ continue;
+
+ *best_idx = idx;
+ return ds;
}
return NULL;
}
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx, int *best_idx)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ if (ds)
+ return ds;
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+}
+
static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req,
@@ -925,7 +977,8 @@ retry:
goto out_mds;
for (i = 0; i < pgio->pg_mirror_count; i++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
if (!ds) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -936,7 +989,6 @@ retry:
goto retry;
}
pgm = &pgio->pg_mirrors[i];
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
}
@@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
break;
case -NFS4ERR_RETRY_UNCACHED_REP:
break;
+ case -EAGAIN:
+ return -NFS4ERR_RESET_TO_PNFS;
/* Invalidate Layout errors */
case -NFS4ERR_PNFS_NO_LAYOUT:
case -ESTALE: /* mapped NFS4ERR_STALE */
@@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
case -EBADHANDLE:
case -ELOOP:
case -ENOSPC:
+ case -EAGAIN:
break;
case -EJUKEBOX:
nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
@@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
+ ff_layout_mark_ds_reachable(lseg, idx);
return 0;
+ }
/* Handle the case of an invalid layout segment */
if (!pnfs_is_valid_lseg(lseg))
@@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ if (status == NFS4ERR_NXIO)
+ ff_layout_mark_ds_unreachable(lseg, idx);
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -1249,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
if (ff_layout_choose_best_ds_for_read(hdr->lseg,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
- goto out_eagain;
+ goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1260,6 +1319,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
}
return 0;
+out_layouterror:
+ ff_layout_send_layouterror(hdr->lseg);
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -1293,15 +1354,6 @@ ff_layout_set_layoutcommit(struct inode *inode,
(unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
-static bool
-ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
-{
- /* No mirroring for now */
- struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
-
- return ff_layout_test_devid_unavailable(node);
-}
-
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1332,10 +1384,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
rpc_exit(task, -EIO);
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
@@ -1369,6 +1417,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
ff_layout_read_prepare_common(task, hdr);
}
+static void
+ff_layout_io_prepare_transmit(struct rpc_task *task,
+ void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (!pnfs_is_valid_lseg(hdr->lseg))
+ rpc_exit(task, -EAGAIN);
+}
+
static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
@@ -1399,9 +1457,10 @@ static void ff_layout_read_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_read(hdr);
pnfs_generic_rw_release(data);
}
@@ -1513,11 +1572,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
- if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
- rpc_exit(task, -EHOSTDOWN);
- return -EAGAIN;
- }
-
ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1573,9 +1627,10 @@ static void ff_layout_write_release(void *data)
struct nfs_pgio_header *hdr = data;
ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
- if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
ff_layout_reset_write(hdr, true);
- else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
ff_layout_reset_write(hdr, false);
pnfs_generic_rw_release(data);
}
@@ -1657,6 +1712,7 @@ static void ff_layout_commit_release(void *data)
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1664,6 +1720,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
.rpc_release = ff_layout_read_release,
@@ -1671,6 +1728,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1678,6 +1736,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
.rpc_release = ff_layout_write_release,
@@ -1703,6 +1762,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
u32 idx = hdr->pgio_mirror_idx;
@@ -1713,20 +1773,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
__func__, hdr->inode->i_ino,
hdr->args.pgbase, (size_t)hdr->args.count, offset);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1734,13 +1795,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1770,26 +1829,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
int idx = hdr->pgio_mirror_idx;
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_failed;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
goto out_failed;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
if (!ds_cred)
goto out_failed;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
@@ -1800,13 +1861,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
- fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
if (fh)
hdr->args.fh = fh;
- if (vers == 4 &&
- !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid))
- goto out_failed;
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1849,6 +1908,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
u32 idx;
int vers, ret;
@@ -1859,20 +1919,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
if (!ds)
goto out_err;
- ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
data->inode);
if (IS_ERR(ds_clnt))
goto out_err;
- ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
if (!ds_cred)
goto out_err;
- vers = nfs4_ff_layout_ds_version(lseg, idx);
+ vers = nfs4_ff_layout_ds_version(mirror);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
@@ -2036,7 +2097,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
dprintk("%s: Begin\n", __func__);
- xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
@@ -2102,6 +2163,52 @@ out_nomem:
return -ENOMEM;
}
+#ifdef CONFIG_NFS_V4_2
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct nfs42_layout_error *errors;
+ LIST_HEAD(head);
+
+ if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+ return;
+ ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+ if (list_empty(&head))
+ return;
+
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
+ sizeof(*errors), GFP_NOFS);
+ if (errors != NULL) {
+ const struct nfs4_ff_layout_ds_err *pos;
+ size_t n = 0;
+
+ list_for_each_entry(pos, &head, list) {
+ errors[n].offset = pos->offset;
+ errors[n].length = pos->length;
+ nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+ errors[n].errors[0].dev_id = pos->deviceid;
+ errors[n].errors[0].status = pos->status;
+ errors[n].errors[0].opnum = pos->opnum;
+ n++;
+ if (!list_is_last(&pos->list, &head) &&
+ n < NFS42_LAYOUTERROR_MAX)
+ continue;
+ if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+ break;
+ n = 0;
+ }
+ kfree(errors);
+ }
+ ff_layout_free_ds_ioerr(&head);
+}
+#else
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index c2626bad466b..2f369966abf7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
generic_hdr);
}
-static inline struct nfs4_deviceid_node *
-FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
-{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
- FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
- return NULL;
- return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
-}
-
static inline struct nfs4_ff_layout_ds *
FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
{
@@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
static inline struct nfs4_ff_layout_mirror *
FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
{
- if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
- return NULL;
- return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ if (idx < fls->mirror_array_cnt)
+ return fls->mirror_array[idx];
+ return NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+
+ if (mirror != NULL) {
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+
+ if (!IS_ERR_OR_NULL(mirror_ds))
+ return &mirror_ds->id_node;
+ }
+ return NULL;
}
static inline u32
@@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
}
-static inline bool
-ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
-{
- /*
- * Flexfiles should never mark a DS unavailable, but if it does
- * print a (ratelimited) warning as this can affect performance.
- */
- if (nfs4_test_deviceid_unavailable(node)) {
- u32 *p = (u32 *)node->deviceid.data;
-
- pr_warn_ratelimited("NFS: flexfiles layout referencing an "
- "unavailable device [%x%x%x%x]\n",
- p[0], p[1], p[2], p[3]);
- return true;
- }
- return false;
-}
-
static inline int
-nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
{
- return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+ return mirror->mirror_ds->ds_versions[0].version;
}
struct nfs4_ff_layout_ds *
@@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_mirror *mirror, u64 offset,
u64 length, int status, enum nfs_opnum4 opnum,
gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
void ff_layout_free_ds_ioerr(struct list_head *head);
unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
@@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
struct list_head *head,
unsigned int maxnum);
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid);
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid);
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return);
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
- u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp,
struct inode *inode);
-const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
- u32 ds_idx, const struct cred *mdscred);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 11766a74216d..a809989807d6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -183,56 +183,6 @@ out_err:
return NULL;
}
-static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
- struct nfs4_deviceid_node *devid)
-{
- nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid);
- if (!ff_layout_has_available_ds(lseg))
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
- lseg);
-}
-
-static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
- struct nfs4_ff_layout_mirror *mirror,
- bool create)
-{
- if (mirror == NULL || IS_ERR(mirror->mirror_ds))
- goto outerr;
- if (mirror->mirror_ds == NULL) {
- if (create) {
- struct nfs4_deviceid_node *node;
- struct pnfs_layout_hdr *lh = lseg->pls_layout;
- struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
-
- node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
- &mirror->devid, lh->plh_lc_cred,
- GFP_KERNEL);
- if (node)
- mirror_ds = FF_LAYOUT_MIRROR_DS(node);
-
- /* check for race with another call to this function */
- if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
- mirror_ds != ERR_PTR(-ENODEV))
- nfs4_put_deviceid_node(node);
- } else
- goto outerr;
- }
-
- if (IS_ERR(mirror->mirror_ds))
- goto outerr;
-
- if (mirror->mirror_ds->ds == NULL) {
- struct nfs4_deviceid_node *devid;
- devid = &mirror->mirror_ds->id_node;
- ff_layout_mark_devid_invalid(lseg, devid);
- return false;
- }
- return true;
-outerr:
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
- return false;
-}
-
static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
u64 offset, u64 length)
{
@@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
ff_layout_add_ds_error_locked(flo, dserr);
spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-
return 0;
}
@@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
}
struct nfs_fh *
-nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
- struct nfs_fh *fh = NULL;
-
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
- }
-
/* FIXME: For now assume there is only 1 version available for the DS */
- fh = &mirror->fh_versions[0];
-out:
- return fh;
+ return &mirror->fh_versions[0];
}
-int
-nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg,
- u32 mirror_idx,
- nfs4_stateid *stateid)
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+ if (nfs4_ff_layout_ds_version(mirror) == 4)
+ nfs4_stateid_copy(stateid, &mirror->stateid);
+}
- if (!ff_layout_mirror_valid(lseg, mirror, false)) {
- pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
- __func__, mirror_idx);
- goto out;
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL)
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ struct nfs4_deviceid_node *node;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+ &mirror->devid, lo->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
}
- nfs4_stateid_copy(stateid, &mirror->stateid);
- return 1;
-out:
- return 0;
+ if (IS_ERR(mirror->mirror_ds))
+ goto outerr;
+
+ return true;
+outerr:
+ return false;
}
/**
* nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
* @lseg: the layout segment we're operating on
- * @ds_idx: index of the DS to use
+ * @mirror: layout mirror describing the DS to use
* @fail_return: return layout on connect failure?
*
* Try to prepare a DS connection to accept an RPC call. This involves
@@ -407,26 +364,18 @@ out:
* Returns a pointer to a connected DS object on success or NULL on failure.
*/
struct nfs4_pnfs_ds *
-nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
bool fail_return)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
struct nfs4_pnfs_ds *ds = NULL;
- struct nfs4_deviceid_node *devid;
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
int status;
- if (!ff_layout_mirror_valid(lseg, mirror, true)) {
- pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
- __func__, ds_idx);
- goto out;
- }
-
- devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
- goto out_fail;
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ goto noconnect;
ds = mirror->mirror_ds->ds;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
*/
- status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
- dataserver_retrans,
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ dataserver_timeo, dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
mirror->mirror_ds->ds_versions[0].minor_version);
@@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].wsize = max_payload;
goto out;
}
-out_fail:
+noconnect:
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
+ ff_layout_send_layouterror(lseg);
if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
ds = NULL;
@@ -466,14 +416,14 @@ out:
}
const struct cred *
-ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
const struct cred *mdscred)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
const struct cred *cred;
if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
- cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode);
if (!cred)
cred = get_cred(mdscred);
} else {
@@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
-nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
struct nfs_client *ds_clp, struct inode *inode)
{
- struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-
switch (mirror->mirror_ds->ds_versions[0].version) {
case 3:
/* For NFSv3 DS, flavor is set when creating DS connections */
@@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (IS_ERR(mirror->mirror_ds))
continue;
devid = &mirror->mirror_ds->id_node;
- if (!ff_layout_test_devid_unavailable(devid))
+ if (!nfs4_test_deviceid_unavailable(devid))
return true;
}
}
@@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
if (!mirror->mirror_ds)
continue;
devid = &mirror->mirror_ds->id_node;
- if (ff_layout_test_devid_unavailable(devid))
+ if (nfs4_test_deviceid_unavailable(devid))
return false;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 094775ea0781..414a90d48493 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode);
/**
* nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
*/
int nfs_sync_mapping(struct address_space *mapping)
{
@@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode)
/**
* nfs_revalidate_inode - Revalidate the inode attributes
- * @server - pointer to nfs_server struct
- * @inode - pointer to inode struct
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
*
* Updates inode attribute information by retrieving the data from the server.
*/
@@ -1255,8 +1256,8 @@ out:
/**
* nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
*/
int nfs_revalidate_mapping(struct inode *inode,
struct address_space *mapping)
@@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/**
* nfs_check_inode_attributes - verify consistency of the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Verifies the attribute cache. If we have just changed the attributes,
* so that fattr carries weak cache consistency data, then it may
@@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
/**
* nfs_inode_attrs_need_update - check if the inode attributes need updating
- * @inode - pointer to inode
- * @fattr - attributes
+ * @inode: pointer to inode
+ * @fattr: attributes
*
* Attempt to divine whether or not an RPC call reply carrying stale
* attributes got scheduled after another call carrying updated ones.
@@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr
/**
* nfs_refresh_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* Check that an RPC call that returned attributes has not overlapped with
* other recent updates of the inode metadata, then decide whether it is
@@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode,
/**
* nfs_post_op_update_inode - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it.
@@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
/**
* nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
@@ -1731,8 +1732,8 @@ out_noforce:
/**
* nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
- * @inode - pointer to inode
- * @fattr - updated attributes
+ * @inode: pointer to inode
+ * @fattr: updated attributes
*
* After an operation that has changed the inode metadata, mark the
* attribute cache as being invalid, then try to update it. Fake up
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b1e577302518..c7cf23ae6597 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,7 +69,8 @@ struct nfs_clone_mount {
* Maximum number of pages that readdir can use for creating
* a vmapped array of pages.
*/
-#define NFS_MAX_READDIR_PAGES 8
+#define NFS_MAX_READDIR_PAGES 64
+#define NFS_MAX_READDIR_RAPAGES 8
struct nfs_client_initdata {
unsigned long init_flags;
@@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
case -ERESTARTSYS:
+ case -EINTR:
case -EACCES:
case -EDQUOT:
case -EFBIG:
@@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err)
case -EROFS:
case -ESTALE:
case -E2BIG:
+ case -ENOMEM:
return true;
default:
return false;
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 9034b4926909..5088fda9b453 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_start_io_read - declare the file is being used for buffered reads
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode)
/**
* nfs_end_io_read - declare that the buffered read operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is done, and release the shared
* lock on inode->i_rwsem.
@@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode)
/**
* nfs_start_io_write - declare the file is being used for buffered writes
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered read operation is about to start, and ensure
* that we block all direct I/O.
@@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode)
/**
* nfs_end_io_write - declare that the buffered write operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a buffered write operation is done, and release the
* lock on inode->i_rwsem.
@@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
/**
* nfs_end_io_direct - declare the file is being used for direct i/o
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is about to start, and ensure
* that we block all buffered I/O.
@@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode)
/**
* nfs_end_io_direct - declare that the direct i/o operation is done
- * @inode - file inode
+ * @inode: file inode
*
* Declare that a direct I/O operation is done, and release the shared
* lock on inode->i_rwsem.
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e5686be67be8..15f099a24c29 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry - parent directory
- * @fh - filehandle for new root dentry
- * @fattr - attributes for new root inode
- * @authflavor - security flavor to use when performing the mount
+ * @dentry: parent directory
+ * @fh: filehandle for new root dentry
+ * @fattr: attributes for new root inode
+ * @authflavor: security flavor to use when performing the mount
*
*/
struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 350675e3ed47..a7ed29de0a40 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -22,6 +22,7 @@
#include <linux/nfs.h>
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -55,42 +56,16 @@
#define NFS_attrstat_sz (1+NFS_fattr_sz)
#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz (2)
-#define NFS_readres_sz (1+NFS_fattr_sz+1)
+#define NFS_readlinkres_sz (2+1)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+1)
#define NFS_writeres_sz (NFS_attrstat_sz)
#define NFS_stat_sz (1)
-#define NFS_readdirres_sz (1)
+#define NFS_readdirres_sz (1+1)
#define NFS_statfsres_sz (1+NFS_info_sz)
static int nfs_stat_to_errno(enum nfs_stat);
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv2 basic data types
*
* Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
@@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
if (unlikely(count > recvd))
@@ -125,9 +100,6 @@ out_cheating:
"count %u > recvd %u\n", count, recvd);
count = recvd;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, NFS2_FHSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = NFS2_FHSIZE;
memcpy(fh->data, p, NFS2_FHSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_V2;
@@ -325,9 +297,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p);
if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
goto out_size;
@@ -472,9 +438,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"length %u > received %u\n", length, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
const struct nfs_readlinkargs *args = data;
encode_fhandle(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS_readlinkres_sz);
}
/*
@@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
const struct nfs_pgio_args *args = data;
encode_readargs(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, NFS_readres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, NFS_readres_sz);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
const struct nfs_readdirargs *args = data;
encode_readdirargs(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
- args->count, NFS_readdirres_sz);
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ args->count, NFS_readdirres_sz);
}
/*
@@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
int error;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p++ == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
}
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->ino = be32_to_cpup(p);
error = decode_filename_inline(xdr, &entry->name, &entry->len);
@@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
*/
entry->prev_cookie = entry->cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
entry->cookie = be32_to_cpup(p);
entry->d_type = DT_UNKNOWN;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
__be32 *p;
p = xdr_inline_decode(xdr, NFS_info_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->tsize = be32_to_cpup(p++);
result->bsize = be32_to_cpup(p++);
result->blocks = be32_to_cpup(p++);
result->bfree = be32_to_cpup(p++);
result->bavail = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9fce18548f7e..c5c3fc6e6c60 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
switch (status) {
case 0:
status = nfs_refresh_inode(inode, fattr);
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
- set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 78df4eb60f85..110358f4986d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,6 +21,7 @@
#include <linux/nfs3.h>
#include <linux/nfs_fs.h>
#include <linux/nfsacl.h>
+#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
@@ -68,13 +69,13 @@
#define NFS3_removeres_sz (NFS3_setattrres_sz)
#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
-#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
@@ -84,7 +85,7 @@
#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
- XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
static int nfs3_stat_to_errno(enum nfs_stat);
@@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = {
};
/*
- * While encoding arguments, set up the reply buffer in advance to
- * receive reply data directly into the page cache.
- */
-static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
- unsigned int base, unsigned int len,
- unsigned int bufsize)
-{
- struct rpc_auth *auth = req->rq_cred->cr_auth;
- unsigned int replen;
-
- replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
-}
-
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
-
-/*
* Encode/decode NFSv3 basic data types
*
* Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
@@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*value = be32_to_cpup(p);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_uint64(struct xdr_stream *xdr, u64 *value)
@@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value)
__be32 *p;
p = xdr_inline_decode(xdr, 8);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
xdr_decode_hyper(p, value);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
u32 count;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (count > NFS3_MAXNAMLEN)
goto out_nametoolong;
p = xdr_inline_decode(xdr, count);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
*name = (const char *)p;
*length = count;
return 0;
@@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr,
out_nametoolong:
dprintk("NFS: returned filename too long: %u\n", count);
return -ENAMETOOLONG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p);
if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
goto out_nametoolong;
@@ -267,9 +233,6 @@ out_cheating:
dprintk("NFS: server cheating in pathname result: "
"count %u > recvd %u\n", count, recvd);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS3_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
*status = be32_to_cpup(p);
+ trace_nfs_xdr_status((int)*status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
length = be32_to_cpup(p++);
if (unlikely(length > NFS3_FHSIZE))
goto out_toobig;
p = xdr_inline_decode(xdr, length);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fh->size = length;
memcpy(fh->data, p, length);
return 0;
out_toobig:
dprintk("NFS: file handle size (%u) too big\n", length);
return -E2BIG;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static void zero_nfs_fh3(struct nfs_fh *fh)
@@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_ftype3(p, &fmode);
@@ -690,9 +647,6 @@ out_uid:
out_gid:
dprintk("NFS: returned invalid gid\n");
return -EINVAL;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_fattr3(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
fattr->valid |= NFS_ATTR_FATTR_PRESIZE
| NFS_ATTR_FATTR_PRECHANGE
@@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
__be32 *p;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_wcc_attr(xdr, fattr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
@@ -808,15 +753,12 @@ out:
static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
{
__be32 *p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
if (*p != xdr_zero)
return decode_nfs_fh3(xdr, fh);
zero_nfs_fh3(fh);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
const struct nfs3_readlinkargs *args = data;
encode_nfs_fh3(xdr, args->fh);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->pglen, NFS3_readlinkres_sz);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS3_readlinkres_sz);
}
/*
@@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
encode_read3args(xdr, args);
- prepare_reply_buffer(req, args->pages, args->pgbase,
- args->count, replen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
}
@@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdir3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
const struct nfs3_readdirargs *args = data;
encode_readdirplus3args(xdr, args);
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
args->count, NFS3_readdirres_sz);
}
@@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
encode_nfs_fh3(xdr, args->fh);
encode_uint32(xdr, args->mask);
if (args->mask & (NFS_ACL | NFS_DFACL)) {
- prepare_reply_buffer(req, args->pages, 0,
+ rpc_prepare_reply_pages(req, args->pages, 0,
NFSACL_MAXPAGES << PAGE_SHIFT,
ACL3_getaclres_sz);
req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
@@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
count = be32_to_cpup(p++);
eof = be32_to_cpup(p++);
ocount = be32_to_cpup(p++);
@@ -1667,9 +1609,6 @@ out_cheating:
count = recvd;
eof = 0;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
- result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
+ result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
error = decode_read3resok(xdr, result);
out:
return error;
@@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->count = be32_to_cpup(p++);
result->verf->committed = be32_to_cpup(p++);
if (unlikely(result->verf->committed > NFS_FILE_SYNC))
goto out_badvalue;
if (decode_writeverf3(xdr, &result->verf->verifier))
- goto out_eio;
+ return -EIO;
return result->count;
out_badvalue:
dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out_eio:
- return -EIO;
}
static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
u64 new_cookie;
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
if (unlikely(error)) {
@@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
out_truncated:
dprintk("NFS: directory entry contains invalid file handle\n");
*entry = old;
@@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 8 * 6 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
p = xdr_decode_size3(p, &result->tbytes);
p = xdr_decode_size3(p, &result->fbytes);
p = xdr_decode_size3(p, &result->abytes);
@@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr,
xdr_decode_size3(p, &result->afiles);
/* ignore invarsec */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
@@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->rtmax = be32_to_cpup(p++);
result->rtpref = be32_to_cpup(p++);
result->rtmult = be32_to_cpup(p++);
@@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
@@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr,
__be32 *p;
p = xdr_inline_decode(xdr, 4 * 6);
- if (unlikely(p == NULL))
- goto out_overflow;
+ if (unlikely(!p))
+ return -EIO;
result->max_link = be32_to_cpup(p++);
result->max_namelen = be32_to_cpup(p);
/* ignore remaining fields */
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 19ec38f85ce0..901cca7542f9 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors,
+ size_t n);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd9998d..ff6f85fb676b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -672,6 +672,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return 0;
}
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+ struct nfs42_layouterror_data *data;
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (data) {
+ data->args.inode = data->inode = nfs_igrab_and_active(inode);
+ if (data->inode) {
+ data->lseg = pnfs_get_lseg(lseg);
+ if (data->lseg)
+ return data;
+ nfs_iput_and_deactive(data->inode);
+ }
+ kfree(data);
+ }
+ return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+ pnfs_put_lseg(data->lseg);
+ nfs_iput_and_deactive(data->inode);
+ kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+ unsigned i;
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ for (i = 0; i < data->args.num_errors; i++)
+ nfs4_stateid_copy(&data->args.errors[i].stateid,
+ &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.errors[0].stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+ }
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+
+ nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+ .rpc_call_prepare = nfs42_layouterror_prepare,
+ .rpc_call_done = nfs42_layouterror_done,
+ .rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors, size_t n)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct nfs42_layouterror_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layouterror_ops,
+ .flags = RPC_TASK_ASYNC,
+ };
+ unsigned int i;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+ return -EOPNOTSUPP;
+ if (n > NFS42_LAYOUTERROR_MAX)
+ return -EINVAL;
+ data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+ for (i = 0; i < n; i++) {
+ data->args.errors[i] = errors[i];
+ data->args.num_errors++;
+ data->res.num_errors++;
+ }
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task_setup.callback_data = data;
+ task_setup.rpc_client = NFS_SERVER(inode)->client;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct file *dst_f, struct nfs_lock_context *src_lock,
struct nfs_lock_context *dst_lock, loff_t src_offset,
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 69f72ed2bf87..aed865a84629 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -51,6 +51,15 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ 1 /* Array size */ + \
+ encode_device_error_maxsz)
+#define decode_layouterror_maxsz (op_decode_hdr_maxsz)
#define encode_clone_maxsz (encode_stateid_maxsz + \
encode_stateid_maxsz + \
2 /* src offset */ + \
@@ -59,43 +68,53 @@
#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_allocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
encode_copy_maxsz + \
encode_commit_maxsz)
#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
decode_copy_maxsz + \
decode_commit_maxsz)
#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_offload_cancel_maxsz)
#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_seek_maxsz)
#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_seek_maxsz)
#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
@@ -106,6 +125,16 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ decode_layouterror_maxsz)
#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr,
xdr_encode_hyper(p, args->count);
}
+static void encode_device_error(struct xdr_stream *xdr,
+ const struct nfs42_device_error *error)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+ p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(error->status);
+ *p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+ const struct nfs42_layout_error *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, args->offset);
+ p = xdr_encode_hyper(p, args->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(1);
+ encode_device_error(xdr, &args->errors[0]);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layouterror_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ int i;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ for (i = 0; i < args->num_errors; i++)
+ encode_layouterror(xdr, &args->errors[i], &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
count = be32_to_cpup(p);
if (count > 1)
return -EREMOTEIO;
@@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr,
status = decode_opaque_fixed(xdr, &res->stateid,
NFS4_STATEID_SIZE);
if (unlikely(status))
- goto out_overflow;
+ return -EIO;
}
p = xdr_inline_decode(xdr, 8 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier);
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy_requirements(struct xdr_stream *xdr,
@@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->consecutive = be32_to_cpup(p++);
res->synchronous = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
@@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
p = xdr_inline_decode(xdr, 4 + 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->sr_eof = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &res->sr_offset);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutstats(struct xdr_stream *xdr)
@@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_CLONE);
}
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -704,4 +776,30 @@ out:
return status;
}
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_layouterror_res *res = data;
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+
+ for (i = 0; i < res->num_errors && status == 0; i++)
+ status = decode_layouterror(xdr);
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 2548405da1f7..1339ede979af 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
}
#ifdef CONFIG_NFS_V4_1
-/**
+/*
* Per auth flavor data server rpc clients
*/
struct nfs4_ds_server {
@@ -51,7 +51,9 @@ struct nfs4_ds_server {
};
/**
- * Common lookup case for DS I/O
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavour to match
*/
static struct nfs4_ds_server *
nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
@@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss)
}
/**
-* Find or create a DS rpc client with th MDS server rpc client auth flavor
-* in the nfs_client cl_ds_clients list.
-*/
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
struct rpc_clnt *
nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
{
@@ -145,7 +151,6 @@ static void
nfs4_shutdown_ds_clients(struct nfs_client *clp)
{
struct nfs4_ds_server *dss;
- LIST_HEAD(shutdown_list);
while (!list_empty(&clp->cl_ds_clients)) {
dss = list_entry(clp->cl_ds_clients.next,
@@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
/**
* nfs40_init_client - nfs_client initialization tasks for NFSv4.0
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp)
/**
* nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
- * @clp - nfs_client to initialize
+ * @clp: nfs_client to initialize
*
* Returns zero on success, or a negative errno if some error occurred.
*/
@@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* nfs4_init_client - Initialise an NFS4 client record
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: callback IP address in presentation format
- * @authflavor: authentication flavor for underlying RPC transport
+ * @cl_init: pointer to nfs_client_initdata
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
@@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1,
/**
* nfs4_detect_session_trunking - Checks for session trunking.
- *
- * Called after a successful EXCHANGE_ID on a multi-addr connection.
- * Upon success, add the transport.
- *
* @clp: original mount nfs_client
* @res: result structure from an exchange_id using the original mount
* nfs_client with a new multi_addr transport
+ * @xprt: pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
*
* Returns zero on success, otherwise -EINVAL
*
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24f06dcc2b08..2e460c33ae48 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
/**
* nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
* @server: NFS server struct
* @flavors: List of security tuples returned by SECINFO procedure
*
@@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry - parent directory
- * @locations - array of NFSv4 server location information
+ * @dentry: parent directory
+ * @locations: array of NFSv4 server location information
*
*/
static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 557a5d636183..4dbb0ee23432 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
res->sr_slot = NULL;
}
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
+ slot->seq_nr_highest_sent = seqnr;
+}
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ slot->seq_nr_highest_sent = seqnr;
+ slot->seq_nr_last_acked = seqnr;
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot *slot = res->sr_slot;
struct nfs_client *clp;
- bool interrupted = false;
int ret = 1;
if (slot == NULL)
goto out_noaction;
/* don't increment the sequence number if the task wasn't sent */
- if (!RPC_WAS_SENT(task))
+ if (!RPC_WAS_SENT(task) || slot->seq_done)
goto out;
session = slot->table->session;
- if (slot->interrupted) {
- if (res->sr_status != -NFS4ERR_DELAY)
- slot->interrupted = 0;
- interrupted = true;
- }
-
trace_nfs4_sequence_done(session, res);
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
+ /* Mark this sequence number as having been acked */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
clp = session->clp;
@@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task,
* sr_status remains 1 if an RPC level error occurred.
* The server may or may not have processed the sequence
* operation..
- * Mark the slot as having hosted an interrupted RPC call.
*/
- slot->interrupted = 1;
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ slot->seq_done = 1;
goto out;
case -NFS4ERR_DELAY:
/* The server detected a resend of the RPC call and
@@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task,
* The server thinks we tried to replay a request.
* Retry the call after bumping the sequence ID.
*/
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto retry_new_seq;
case -NFS4ERR_BADSLOT:
/*
@@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto session_recover;
goto retry_nowait;
case -NFS4ERR_SEQ_MISORDERED:
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
/*
- * Was the last operation on this sequence interrupted?
- * If so, retry after bumping the sequence number.
- */
- if (interrupted)
- goto retry_new_seq;
- /*
- * Could this slot have been previously retired?
- * If so, then the server may be expecting seq_nr = 1!
+ * Were one or more calls using this slot interrupted?
+ * If the server never received the request, then our
+ * transmitted slot sequence number may be too high.
*/
- if (slot->seq_nr != 1) {
- slot->seq_nr = 1;
+ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+ slot->seq_nr--;
goto retry_nowait;
}
- goto session_recover;
+ /*
+ * RFC5661:
+ * A retry might be sent while the original request is
+ * still in progress on the replier. The replier SHOULD
+ * deal with the issue by returning NFS4ERR_DELAY as the
+ * reply to SEQUENCE or CB_SEQUENCE operation, but
+ * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+ *
+ * Restart the search after a delay.
+ */
+ slot->seq_nr = slot->seq_nr_highest_sent;
+ goto out_retry;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_done = nfs41_call_sync_done,
};
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
-{
- struct rpc_task *task;
-
- task = _nfs41_proc_sequence(client, cred, slot, true);
- if (!IS_ERR(task))
- rpc_put_task_async(task);
-}
-
#else /* !CONFIG_NFS_V4_1 */
static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task,
}
EXPORT_SYMBOL_GPL(nfs4_sequence_done);
-static void
-nfs4_sequence_process_interrupted(struct nfs_client *client,
- struct nfs4_slot *slot, const struct cred *cred)
+#endif /* !CONFIG_NFS_V4_1 */
+
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
{
- WARN_ON_ONCE(1);
- slot->interrupted = 0;
+ res->sr_timestamp = jiffies;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
}
-#endif /* !CONFIG_NFS_V4_1 */
-
static
void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
@@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
args->sa_slot = slot;
res->sr_slot = slot;
- res->sr_timestamp = jiffies;
- res->sr_status_flags = 0;
- res->sr_status = 1;
-
}
int nfs4_setup_sequence(struct nfs_client *client,
@@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client,
task->tk_timeout = 0;
}
- for (;;) {
- spin_lock(&tbl->slot_tbl_lock);
- /* The state manager will wait until the slot table is empty */
- if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
- goto out_sleep;
-
- slot = nfs4_alloc_slot(tbl);
- if (IS_ERR(slot)) {
- /* Try again in 1/4 second */
- if (slot == ERR_PTR(-ENOMEM))
- task->tk_timeout = HZ >> 2;
- goto out_sleep;
- }
- spin_unlock(&tbl->slot_tbl_lock);
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
- if (likely(!slot->interrupted))
- break;
- nfs4_sequence_process_interrupted(client,
- slot, task->tk_msg.rpc_cred);
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ /* Try again in 1/4 second */
+ if (slot == ERR_PTR(-ENOMEM))
+ task->tk_timeout = HZ >> 2;
+ goto out_sleep;
}
+ spin_unlock(&tbl->slot_tbl_lock);
nfs4_sequence_attach_slot(args, res, slot);
trace_nfs4_setup_sequence(session, args);
out_start:
+ nfs41_sequence_res_init(res);
rpc_call_start(task);
return 0;
@@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
static void nfs_set_open_stateid_locked(struct nfs4_state *state,
const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ __must_hold(&state->owner->so_lock)
+ __must_hold(&state->seqlock)
+ __must_hold(RCU)
+
{
DEFINE_WAIT(wait);
int status = 0;
@@ -5963,7 +5962,7 @@ out:
/**
* nfs4_proc_setclientid_confirm - Confirm client ID
* @clp: state data structure
- * @res: result of a previous SETCLIENTID
+ * @arg: result of a previous SETCLIENTID
* @cred: credential to use for this call
*
* Returns zero, a negative errno, or a negative NFS4ERR status code.
@@ -7527,7 +7526,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
return status;
}
-/**
+/*
* If 'use_integrity' is true and the state managment nfs_client
* cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
* and the machine credential as per RFC3530bis and RFC5661 Security
@@ -8937,10 +8936,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
if (status != 0)
goto out;
- /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
- if (task->tk_status < 0 || lgp->res.layoutp->len == 0) {
+ if (task->tk_status < 0) {
status = nfs4_layoutget_handle_exception(task, lgp, &exception);
*timeout = exception.timeout;
+ } else if (lgp->res.layoutp->len == 0) {
+ status = -EAGAIN;
+ *timeout = nfs4_update_delay(&exception.timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
@@ -9219,7 +9220,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return status;
}
-/**
+/*
* Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
* possible) as per RFC3530bis and RFC5661 Security Considerations sections
*/
@@ -9484,7 +9485,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
* @server: server / transport on which to perform the operation
* @stateid: state ID to release
* @cred: credential
- * @is_recovery: set to true if this call needs to be privileged
+ * @privileged: set to true if this call needs to be privileged
*
* Note: this function is always asynchronous.
*/
@@ -9691,7 +9692,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
- | NFS_CAP_CLONE,
+ | NFS_CAP_CLONE
+ | NFS_CAP_LAYOUTERROR,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index a5489d70a724..bcb532def9e2 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
/**
* nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
- * @tbl - controlling slot table
+ * @tbl: controlling slot table
*
*/
void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
@@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
slot->table = tbl;
slot->slot_nr = slotid;
slot->seq_nr = seq_init;
+ slot->seq_nr_highest_sent = seq_init;
+ slot->seq_nr_last_acked = seq_init - 1;
}
return slot;
}
@@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
p = &tbl->slots;
while (*p) {
(*p)->seq_nr = ivalue;
- (*p)->interrupted = 0;
+ (*p)->seq_nr_highest_sent = ivalue;
+ (*p)->seq_nr_last_acked = ivalue - 1;
p = &(*p)->next;
}
tbl->highest_used_slotid = NFS4_NO_SLOT;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3c550f297561..b996ee23f1ba 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -10,7 +10,7 @@
/* maximum number of slots to use */
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
-#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -23,8 +23,9 @@ struct nfs4_slot {
unsigned long generation;
u32 slot_nr;
u32 seq_nr;
- unsigned int interrupted : 1,
- privileged : 1,
+ u32 seq_nr_last_acked;
+ u32 seq_nr_highest_sent;
+ unsigned int privileged : 1,
seq_done : 1;
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 02488b50534a..3de36479ed7a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
* nfs4_get_state_owner - Look up a state owner given a credential
* @server: nfs_server to search
* @cred: RPC credential to match
+ * @gfp_flags: allocation mode
*
* Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
*/
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b4557cf685fb..cd1a5c08da9a 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence,
)
);
+TRACE_EVENT(nfs4_xdr_status,
+ TP_PROTO(
+ u32 op,
+ int error
+ ),
+
+ TP_ARGS(op, error),
+
+ TP_STRUCT__entry(
+ __field(u32, op)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->op = op;
+ __entry->error = -error;
+ ),
+
+ TP_printk(
+ "operation %d: nfs status %d (%s)",
+ __entry->op,
+ __entry->error, show_nfsv4_errors(__entry->error)
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_open_event,
TP_PROTO(
const struct nfs_open_context *ctx,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2fc8f6fa25e4..cfcabc33e24d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -54,6 +54,7 @@
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
+#include "nfs4trace.h"
#include "internal.h"
#include "nfs4idmap.h"
#include "nfs4session.h"
@@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
nfs4_fattr_bitmap_maxsz)
#define encode_read_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1)
#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz)
+ decode_verifier_maxsz + 1)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
-#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 4)
#define decode_write_maxsz (op_decode_hdr_maxsz + \
@@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
#define encode_getacl_maxsz (encode_getattr_maxsz)
#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
- nfs4_fattr_bitmap_maxsz + 1)
+ nfs4_fattr_bitmap_maxsz + 1 + 1)
#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
#define decode_setacl_maxsz (decode_setattr_maxsz)
#define encode_fs_locations_maxsz \
(encode_getattr_maxsz)
#define decode_fs_locations_maxsz \
- (0)
+ (1)
#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
@@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap, word 0 */)
+ 1 /* notification bitmap, word 0 */ + \
+ 1 /* possible XDR padding */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
decode_stateid_maxsz + \
- XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
2 /* offset */ + \
2 /* length */ + \
@@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- struct rpc_auth *auth = req->rq_cred->cr_auth;
/* initialize running count of expected bytes in reply.
* NOTE: the replied tag SHOULD be the same is the one sent,
* but this is not required as a MUST for the server to do so. */
- hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
+ hdr->replen = 3 + hdr->taglen;
WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
encode_string(xdr, hdr->taglen, hdr->tag);
@@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
if (args->lg_args) {
encode_layoutget(xdr, args->lg_args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->lg_args->layout.pages,
- 0, args->lg_args->layout.pglen);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen);
}
encode_nops(&hdr);
}
@@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readlink(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->pglen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_readdir(xdr, args, req, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
- dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
- __func__, hdr.replen << 2, args->pages,
- args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
encode_nops(&hdr);
}
@@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_read(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->pages, args->pgbase, args->count);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
req->rq_rcv_buf.flags |= XDRBUF_READ;
encode_nops(&hdr);
}
@@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr(xdr, nfs4_acl_bitmap, NULL,
ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, 0, args->acl_len);
-
+ rpc_prepare_reply_pages(req, args->acl_pages, 0,
+ args->acl_len, replen);
encode_nops(&hdr);
}
@@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
encode_fs_locations(xdr, args->bitmask, &hdr);
}
- /* Set up reply kvec to capture returned fs_locations array. */
- xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- (struct page **)&args->page, 0, PAGE_SIZE);
+ rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+ PAGE_SIZE, replen);
encode_nops(&hdr);
}
@@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
/* set up reply kvec. Subtract notification bitmap max size (2)
* so that notification bitmap is put in xdr_buf tail */
- xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
- args->pdev->pages, args->pdev->pgbase,
- args->pdev->pglen);
-
+ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+ args->pdev->pglen, hdr.replen - 2);
encode_nops(&hdr);
}
@@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
encode_layoutget(xdr, args, &hdr);
- xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
- args->layout.pages, 0, args->layout.pglen);
-
+ rpc_prepare_reply_pages(req, args->layout.pages, 0,
+ args->layout.pglen, hdr.replen);
encode_nops(&hdr);
}
@@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
}
#endif /* CONFIG_NFS_V4_1 */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("nfs: %s: prematurely hit end of receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string,
NFS4_OPAQUE_LIMIT);
- if (unlikely(ret < 0)) {
- if (ret == -EBADMSG)
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
*len = ret;
return 0;
}
@@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->status = be32_to_cpup(p++);
hdr->taglen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, hdr->taglen + 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
hdr->nops = be32_to_cpup(p);
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
@@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
opnum = be32_to_cpup(p++);
if (unlikely(opnum != expected))
goto out_bad_operation;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *nfs_retval = 0;
+ return true;
+out_status:
nfserr = be32_to_cpup(p);
- if (nfserr == NFS_OK)
- *nfs_retval = 0;
- else
- *nfs_retval = nfs4_stat_to_errno(nfserr);
+ trace_nfs4_xdr_status(opnum, nfserr);
+ *nfs_retval = nfs4_stat_to_errno(nfserr);
return true;
out_bad_operation:
dprintk("nfs: Server returned operation"
@@ -3214,7 +3197,6 @@ out_bad_operation:
*nfs_retval = -EREMOTEIO;
return false;
out_overflow:
- print_overflow_msg(__func__, xdr);
*nfs_retval = -EIO;
return false;
}
@@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace)
char *str;
p = xdr_inline_decode(xdr, 12);
- if (likely(p))
- return decode_opaque_inline(xdr, &strlen, &str);
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (unlikely(!p))
+ return -EIO;
+ return decode_opaque_inline(xdr, &strlen, &str);
}
static ssize_t
@@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
if (likely(ret >= 0))
return ret;
- if (ret == -EMSGSIZE)
- return sz;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (ret != -EMSGSIZE)
+ return -EIO;
+ return sz;
}
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -3268,13 +3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*attrlen = be32_to_cpup(p);
*savep = xdr_stream_pos(xdr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
dprintk("%s: bad type %d\n", __func__, *type);
@@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
}
dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
@@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*type = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
}
dprintk("%s: expire type=0x%x\n", __func__, *type);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
ret = NFS_ATTR_FATTR_CHANGE;
@@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
p = xdr_inline_decode(xdr, 16);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &fsid->major);
xdr_decode_hyper(p, &fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
@@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *
if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
*res = -be32_to_cpup(p);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
@@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (fh != NULL) {
memcpy(fh->data, p, len);
fh->size = len;
@@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
n = be32_to_cpup(p);
if (n == 0)
goto root_path;
@@ -3718,9 +3647,6 @@ out_eio:
dprintk(" status %d", status);
status = -EIO;
goto out;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -3745,7 +3671,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
goto out;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
@@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
loc = &res->locations[res->nlocations];
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ goto out_eio;
m = be32_to_cpup(p);
dprintk("%s: servers:\n", __func__);
@@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
out_eio:
status = -EIO;
goto out;
@@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxlink = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*maxname = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
uint64_t maxread;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxread);
if (maxread > 0x7FFFFFFF)
maxread = 0x7FFFFFFF;
@@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
}
dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
uint64_t maxwrite;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &maxwrite);
if (maxwrite > 0x7FFFFFFF)
maxwrite = 0x7FFFFFFF;
@@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
}
dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
tmp = be32_to_cpup(p);
*mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
*nlink = be32_to_cpup(p);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
@@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_OWNER;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
return NFS_ATTR_FATTR_GROUP;
}
out:
- if (len != -EBADMSG)
- return 0;
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
major = be32_to_cpup(p++);
minor = be32_to_cpup(p);
tmp = MKDEV(major, minor);
@@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return ret;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static __be32 *
@@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_nfstime4(p, time);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
lfs = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pi = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
memcpy(label->label, p, len);
@@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
(char *)label->label, label->len, label->pi, label->lfs);
return status;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
p = xdr_inline_decode(xdr, 20);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
cinfo->atomic = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &cinfo->before);
xdr_decode_hyper(p, &cinfo->after);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
@@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
supp = be32_to_cpup(p++);
acc = be32_to_cpup(p);
*supported = supp;
*access = acc;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
{
ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
- if (unlikely(ret < 0)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(ret < 0))
return -EIO;
- }
return 0;
}
@@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
bmlen = be32_to_cpup(p);
p = xdr_inline_decode(xdr, bmlen << 2);
if (likely(p))
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr,
if (likely(bitmap[0] & hint_bit)) {
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, res);
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_first_threshold_item4(struct xdr_stream *xdr,
@@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr,
/* layout type */
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
res->l_type = be32_to_cpup(p);
/* thi_hintset bitmap */
@@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
return -EREMOTEIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num = be32_to_cpup(p);
if (num == 0)
return 0;
@@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
}
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
@@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
fsinfo->nlayouttypes = be32_to_cpup(p);
/* pNFS is not supported by the underlying file system */
@@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
/* Decode and set first layout type, move xdr->p past unused types */
p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* If we get too many, then just cap it at the max */
if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
@@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr,
for(i = 0; i < fsinfo->nlayouttypes; ++i)
fsinfo->layouttype[i] = be32_to_cpup(p++);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
/*
@@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
}
@@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
*res = 0;
if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p)) {
- print_overflow_msg(__func__, xdr);
+ if (unlikely(!p))
return -EIO;
- }
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
}
@@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
fh->size = len;
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(fh->data, p, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
p = xdr_decode_hyper(p, &length);
type = be32_to_cpup(p++); /* 4 byte read */
@@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
p = xdr_inline_decode(xdr, namelen); /* variable size field */
- if (likely(p))
- return -NFS4ERR_DENIED;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
+ if (likely(!p))
+ return -EIO;
+ return -NFS4ERR_DENIED;
}
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
case NFS4_LIMIT_SIZE:
@@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr,
maxsize >>= PAGE_SHIFT;
*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_rw_delegation(struct xdr_stream *xdr,
@@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->do_recall = be32_to_cpup(p);
switch (delegation_type) {
@@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return -EIO;
}
return decode_ace(xdr, NULL);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
why_no_delegation = be32_to_cpup(p);
switch (why_no_delegation) {
case WND4_CONTENTION:
@@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
/* Ignore for now */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
delegation_type = be32_to_cpup(p);
res->delegation_type = 0;
switch (delegation_type) {
@@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
return decode_no_delegation(xdr, res);
}
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->rflags = be32_to_cpup(p++);
bmlen = be32_to_cpup(p);
if (bmlen > 10)
@@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p = xdr_inline_decode(xdr, bmlen << 2);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
res->attrset[i] = be32_to_cpup(p++);
@@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
return status;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
eof = be32_to_cpup(p++);
count = be32_to_cpup(p);
recvd = xdr_read_pages(xdr, count);
@@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
res->eof = eof;
res->count = count;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
/* Convert length of symlink */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len >= rcvbuf->page_len || len <= 0) {
dprintk("nfs: server returned giant symlink!\n");
@@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
*/
xdr_terminate_string(rcvbuf, len);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr)
return status;
if (decode_bitmap4(xdr, NULL, 0) >= 0)
return 0;
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
opnum = be32_to_cpup(p++);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
@@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
if (nfserr == NFS_OK) {
p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->clientid);
memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
@@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re
/* skip netid string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
/* skip uaddr string */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
return -NFSERR_CLID_INUSE;
} else
return nfs4_stat_to_errno(nfserr);
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->count = be32_to_cpup(p++);
res->verf->committed = be32_to_cpup(p++);
return decode_write_verifier(xdr, &res->verf->verifier);
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_delegreturn(struct xdr_stream *xdr)
@@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
oid_len = be32_to_cpup(p);
if (oid_len > GSS_OID_MAX_LEN)
- goto out_err;
+ return -EINVAL;
p = xdr_inline_decode(xdr, oid_len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
memcpy(flavor->flavor_info.oid.data, p, oid_len);
flavor->flavor_info.oid.len = oid_len;
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
flavor->flavor_info.qop = be32_to_cpup(p++);
flavor->flavor_info.service = be32_to_cpup(p);
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-out_err:
- return -EINVAL;
}
static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->flavors->num_flavors = 0;
num_flavors = be32_to_cpup(p);
@@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sec_flavor->flavor = be32_to_cpup(p);
if (sec_flavor->flavor == RPC_AUTH_GSS) {
@@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res
status = 0;
out:
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
@@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
xdr_decode_hyper(p, &res->clientid);
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p++);
@@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* server_owner4.so_minor_id */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->server_owner->minor_id);
/* server_owner4.so_major_id */
@@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* Implementation Id */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
impl_id_count = be32_to_cpup(p++);
if (impl_id_count) {
@@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
/* nii_date */
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
res->impl_id->date.nseconds = be32_to_cpup(p);
/* if there's more than one entry, ignore the rest */
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 28);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
val = be32_to_cpup(p++); /* headerpadsz */
if (val)
return -EINVAL; /* no support for header padding yet */
@@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
if (nr_attrs == 1) {
p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
/* dir flags, rdma mode bool */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->dir = be32_to_cpup(p++);
if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
@@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
res->use_conn_in_rdma_mode = true;
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_create_session(struct xdr_stream *xdr,
@@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr,
/* seqid, flags */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p);
@@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr,
if (!status)
status = decode_chan_attrs(xdr, &res->bc_attrs);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -5967,7 +5767,6 @@ out_err:
res->sr_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out_err;
#else /* CONFIG_NFS_V4_1 */
@@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
if (status == -ETOOSMALL) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
pdev->mincount = be32_to_cpup(p);
dprintk("%s: Min count too small. mincnt = %u\n",
__func__, pdev->mincount);
@@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
type = be32_to_cpup(p++);
if (type != pdev->layout_type) {
dprintk("%s: layout mismatch req: %u pdev: %u\n",
@@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
*/
pdev->mincount = be32_to_cpup(p);
if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
- goto out_overflow;
+ return -EIO;
/* Parse notification bitmap, verifying that it is zero. */
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
len = be32_to_cpup(p);
if (len) {
uint32_t i;
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->notification = be32_to_cpup(p++);
for (i = 1; i < len; i++) {
@@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
}
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
@@ -6115,7 +5911,6 @@ out:
res->status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
return status;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
status = decode_layout_stateid(xdr, &res->stateid);
else
nfs4_stateid_copy(&res->stateid, &invalid_stateid);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_layoutcommit(struct xdr_stream *xdr,
@@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
sizechanged = be32_to_cpup(p);
if (sizechanged) {
/* throw away new size */
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
}
return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
}
static int decode_test_stateid(struct xdr_stream *xdr,
@@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
num_res = be32_to_cpup(p++);
if (num_res != 1)
- goto out;
+ return -EIO;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EIO;
res->status = be32_to_cpup(p++);
return status;
-out_overflow:
- print_overflow_msg(__func__, xdr);
-out:
- return -EIO;
}
static int decode_free_stateid(struct xdr_stream *xdr,
@@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
uint64_t new_cookie;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero) {
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
if (*p == xdr_zero)
return -EAGAIN;
entry->eof = 1;
@@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
p = xdr_decode_hyper(p, &new_cookie);
entry->len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, entry->len);
if (unlikely(!p))
- goto out_overflow;
+ return -EAGAIN;
entry->name = (const char *) p;
/*
@@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
if (decode_attr_bitmap(xdr, bitmap) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_attr_length(xdr, &len, &savep) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
NULL, entry->label, entry->server) < 0)
- goto out_overflow;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
@@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->cookie = new_cookie;
return 0;
-
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EAGAIN;
}
/*
@@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(COPY, enc_copy, dec_copy),
PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
+ PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index b60d5fbd7727..a90b363500c2 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -11,3 +11,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index bd60f8d1e181..a0d6910aa03a 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done,
)
);
+TRACE_DEFINE_ENUM(NFS_OK);
+TRACE_DEFINE_ENUM(NFSERR_PERM);
+TRACE_DEFINE_ENUM(NFSERR_NOENT);
+TRACE_DEFINE_ENUM(NFSERR_IO);
+TRACE_DEFINE_ENUM(NFSERR_NXIO);
+TRACE_DEFINE_ENUM(NFSERR_ACCES);
+TRACE_DEFINE_ENUM(NFSERR_EXIST);
+TRACE_DEFINE_ENUM(NFSERR_XDEV);
+TRACE_DEFINE_ENUM(NFSERR_NODEV);
+TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFSERR_ISDIR);
+TRACE_DEFINE_ENUM(NFSERR_INVAL);
+TRACE_DEFINE_ENUM(NFSERR_FBIG);
+TRACE_DEFINE_ENUM(NFSERR_NOSPC);
+TRACE_DEFINE_ENUM(NFSERR_ROFS);
+TRACE_DEFINE_ENUM(NFSERR_MLINK);
+TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFSERR_DQUOT);
+TRACE_DEFINE_ENUM(NFSERR_STALE);
+TRACE_DEFINE_ENUM(NFSERR_REMOTE);
+TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
+TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
+TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
+
+#define nfs_show_status(x) \
+ __print_symbolic(x, \
+ { NFS_OK, "OK" }, \
+ { NFSERR_PERM, "PERM" }, \
+ { NFSERR_NOENT, "NOENT" }, \
+ { NFSERR_IO, "IO" }, \
+ { NFSERR_NXIO, "NXIO" }, \
+ { NFSERR_ACCES, "ACCES" }, \
+ { NFSERR_EXIST, "EXIST" }, \
+ { NFSERR_XDEV, "XDEV" }, \
+ { NFSERR_NODEV, "NODEV" }, \
+ { NFSERR_NOTDIR, "NOTDIR" }, \
+ { NFSERR_ISDIR, "ISDIR" }, \
+ { NFSERR_INVAL, "INVAL" }, \
+ { NFSERR_FBIG, "FBIG" }, \
+ { NFSERR_NOSPC, "NOSPC" }, \
+ { NFSERR_ROFS, "ROFS" }, \
+ { NFSERR_MLINK, "MLINK" }, \
+ { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFSERR_DQUOT, "DQUOT" }, \
+ { NFSERR_STALE, "STALE" }, \
+ { NFSERR_REMOTE, "REMOTE" }, \
+ { NFSERR_WFLUSH, "WFLUSH" }, \
+ { NFSERR_BADHANDLE, "BADHANDLE" }, \
+ { NFSERR_NOT_SYNC, "NOTSYNC" }, \
+ { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
+ { NFSERR_NOTSUPP, "NOTSUPP" }, \
+ { NFSERR_TOOSMALL, "TOOSMALL" }, \
+ { NFSERR_SERVERFAULT, "REMOTEIO" }, \
+ { NFSERR_BADTYPE, "BADTYPE" }, \
+ { NFSERR_JUKEBOX, "JUKEBOX" })
+
+TRACE_EVENT(nfs_xdr_status,
+ TP_PROTO(
+ int error
+ ),
+
+ TP_ARGS(error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s)",
+ __entry->error, nfs_show_status(__entry->error)
+ )
+);
+
#endif /* _TRACE_NFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e54d899c1848..e9f39fa5964b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -350,7 +350,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
/**
* nfs_unlock_request - Unlock request and wake up sleepers.
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_request(struct nfs_page *req)
{
@@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req)
/**
* nfs_unlock_and_release_request - Unlock request and release the nfs_page
- * @req:
+ * @req: pointer to request
*/
void nfs_unlock_and_release_request(struct nfs_page *req)
{
@@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
* @hdr: The pageio hdr
* @count: Number of bytes to read
- * @offset: Initial offset
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
@@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
/**
* nfs_pgio_error - Clean up from a pageio error
- * @desc: IO descriptor
* @hdr: pageio header
*/
static void nfs_pgio_error(struct nfs_pgio_header *hdr)
@@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
+ nfs_list_move_request(req, &hdr->pages);
if (!last_page || last_page != req->wb_page) {
pageused++;
@@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* nfs_can_coalesce_requests - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pagio_descriptor
*
* The nfs_page structures 'prev' and 'req' are compared to ensure that the
* page data area they describe is contiguous, and that their RPC
@@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
}
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &mirror->pg_list);
+ nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
return 1;
}
@@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
}
}
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ LIST_HEAD(head);
+
+ nfs_list_move_request(req, &head);
+ desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
+}
+
/**
* nfs_pageio_add_request - Attempt to coalesce a request into a page list.
* @desc: destination io descriptor
@@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- if (mirror->pg_recoalesce)
- return 0;
+ if (desc->pg_error < 0 || mirror->pg_recoalesce)
+ goto out_cleanup_subreq;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
@@ -1061,6 +1066,10 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
+out_cleanup_subreq:
+ if (req != subreq)
+ nfs_pageio_cleanup_request(desc, subreq);
+ return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
struct nfs_page *req;
req = list_first_entry(&head, struct nfs_page, wb_list);
- nfs_list_remove_request(req);
if (__nfs_pageio_add_request(desc, req))
continue;
if (desc->pg_error < 0) {
@@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+ desc->pg_error);
}
}
@@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- goto out_failed;
+ goto out_cleanup_subreq;
}
return 1;
+out_cleanup_subreq:
+ if (req != dupreq)
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
- if (!mirror->pg_recoalesce)
+ if (desc->pg_error < 0 || !mirror->pg_recoalesce)
break;
if (!nfs_do_recoalesce(desc))
break;
@@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- nfs_list_remove_request(req);
if (!nfs_pageio_add_request(desc, req))
- nfs_list_add_request(req, &failed);
+ nfs_list_move_request(req, &failed);
}
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 53726da5c010..7066cd7c7aff 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -758,22 +758,35 @@ static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
struct nfs_server *server,
struct list_head *layout_list)
+ __must_hold(&clp->cl_lock)
+ __must_hold(RCU)
{
struct pnfs_layout_hdr *lo, *next;
struct inode *inode;
list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
- if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+ !list_empty(&lo->plh_bulk_destroy))
continue;
+ /* If the sb is being destroyed, just bail */
+ if (!nfs_sb_active(server->super))
+ break;
inode = igrab(lo->plh_inode);
- if (inode == NULL)
- continue;
- list_del_init(&lo->plh_layouts);
- if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
- continue;
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
+ if (inode != NULL) {
+ list_del_init(&lo->plh_layouts);
+ if (pnfs_layout_add_bulk_destroy_list(inode,
+ layout_list))
+ continue;
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ } else {
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ }
+ nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
rcu_read_lock();
return -EAGAIN;
@@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
/* Free all lsegs that are attached to commit buckets */
nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
- iput(inode);
+ nfs_iput_and_deactive(inode);
}
return ret;
}
@@ -1876,7 +1889,7 @@ lookup_again:
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- atomic_read(&lo->plh_outstanding)));
+ !atomic_read(&lo->plh_outstanding)));
if (IS_ERR(lseg) || !list_empty(&lo->plh_segs))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5e80a07b7bea..c0420b979d88 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -104,6 +104,7 @@ enum {
NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
};
enum layoutdriver_policy_flags {
@@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf
void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 7fb59487ee90..537b80d693f1 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+void
nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
{
node->timestamp_unavailable = jiffies;
+ smp_mb__before_atomic();
set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
@@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
if (time_in_range(node->timestamp_unavailable, start, end))
return true;
clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
}
return false;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f9f19784db82..1d95a60b2586 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
}
static void
-nfs_async_read_error(struct list_head *head)
+nfs_async_read_error(struct list_head *head, int error)
{
struct nfs_page *req;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0570391eaa16..23790c7b2289 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name,
/* kill possible hostname list: not supported */
comma = strchr(dev_name, ',');
if (comma != NULL && comma < end)
- *comma = 0;
+ len = comma - dev_name;
}
if (len > maxnamlen)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 79b97b3c4427..52d533967485 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data)
/**
* nfs_async_unlink_done - Sillydelete post-processing
* @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
*
* Do the directory attribute update.
*/
@@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
/**
* nfs_async_unlink_release - Release the sillydelete data.
- * @task: rpc_task of the sillydelete
+ * @calldata: struct nfs_unlinkdata to release
*
* We need to call nfs_put_unlinkdata as a 'tk_release' task since the
* rpc_task would be freed too.
@@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf
/**
* nfs_async_unlink - asynchronous unlinking of a file
- * @dir: parent directory of dentry
- * @dentry: dentry to unlink
+ * @dentry: parent directory of dentry
+ * @name: name of dentry to unlink
*/
static int
nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
@@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = {
* @new_dir: target directory for the rename
* @old_dentry: original dentry to be renamed
* @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
*
* It's expected that valid references to the dentries and inodes are held
*/
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d09c9f878141..f3ebabaa291d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -26,6 +26,7 @@
#include <linux/iversion.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
#include "delegation.h"
#include "internal.h"
@@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
- struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS);
+ struct nfs_io_completion *ioc;
+ unsigned int pflags = memalloc_nofs_save();
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+ ioc = nfs_io_completion_alloc(GFP_NOFS);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
@@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_pageio_complete(&pgio);
nfs_io_completion_put(ioc);
+ memalloc_nofs_restore(pflags);
+
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
- * @dst: commit list head
* @cinfo: holds list lock and accounting info
*
* This sets the PG_CLEAN bit, updates the cinfo count of
@@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req)
nfs_release_request(req);
}
-static void nfs_async_write_error(struct list_head *head)
+static void nfs_async_write_error(struct list_head *head, int error)
{
struct nfs_page *req;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
+ if (nfs_error_is_fatal(error)) {
+ nfs_context_set_write_error(req->wb_context, error);
+ if (nfs_error_is_fatal_on_server(error)) {
+ nfs_write_error_remove_page(req);
+ continue;
+ }
+ }
nfs_redirty_request(req);
}
}
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
- nfs_async_write_error(&hdr->pages);
+ nfs_async_write_error(&hdr->pages, 0);
filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
hdr->args.offset + hdr->args.count - 1);
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086ea841..8f933e84cec1 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
&resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
resp->count = resp->buffer - argp->buffer;
- if (resp->offset)
- xdr_encode_hyper(resp->offset, argp->cookie);
+ if (resp->offset) {
+ loff_t offset = argp->cookie;
+
+ if (unlikely(resp->offset1)) {
+ /* we ended up with offset on a page boundary */
+ *resp->offset = htonl(offset >> 32);
+ *resp->offset1 = htonl(offset & 0xffffffff);
+ resp->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(resp->offset, offset);
+ }
+ resp->offset = NULL;
+ }
RETURN_STATUS(nfserr);
}
@@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
} else {
xdr_encode_hyper(resp->offset, offset);
}
+ resp->offset = NULL;
}
RETURN_STATUS(nfserr);
@@ -576,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
resp->f_wtmax = max_blocksize;
resp->f_wtpref = max_blocksize;
resp->f_wtmult = PAGE_SIZE;
- resp->f_dtpref = PAGE_SIZE;
+ resp->f_dtpref = max_blocksize;
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4f7d01..93fea246f676 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -573,6 +573,8 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readdirargs *args = rqstp->rq_argp;
+ u32 max_blocksize = svc_max_payload(rqstp);
+
p = decode_fh(p, &args->fh);
if (!p)
return 0;
@@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
- args->count = min_t(u32, args->count, PAGE_SIZE);
+ args->count = min_t(u32, args->count, max_blocksize);
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
@@ -921,6 +923,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
} else {
xdr_encode_hyper(cd->offset, offset64);
}
+ cd->offset = NULL;
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c74e4538d0eb..d219159b98af 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr {
int status;
};
-/*
- * Handle decode buffer overflows out-of-line.
- */
-static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-{
- dprintk("NFS: %s prematurely hit the end of our receive buffer. "
- "Remaining buffer length is %tu words.\n",
- func, xdr->end - xdr->p);
-}
-
static __be32 *xdr_encode_empty_array(__be32 *p)
{
*p++ = xdr_zero;
@@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr,
*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
out_unexpected:
dprintk("NFSD: Callback server returned operation %d but "
@@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr,
hdr->nops = be32_to_cpup(p);
return 0;
out_overflow:
- print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -437,7 +425,6 @@ out:
cb->cb_seq_status = status;
return status;
out_overflow:
- print_overflow_msg(__func__, xdr);
status = -EIO;
goto out;
}
@@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
return PTR_ERR(client);
}
cred = get_backchannel_cred(clp, client, ses);
- if (IS_ERR(cred)) {
+ if (!cred) {
rpc_shutdown_client(client);
- return PTR_ERR(cred);
+ return -ENOMEM;
}
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fb3c9844c82a..6a45fb00c5fc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1544,16 +1544,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
{
u32 slotsize = slot_bytes(ca);
u32 num = ca->maxreqs;
- int avail;
+ unsigned long avail, total_avail;
spin_lock(&nfsd_drc_lock);
- avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
- nfsd_drc_max_mem - nfsd_drc_mem_used);
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
/*
* Never use more than a third of the remaining memory,
* unless it's the only way to give this client a slot:
*/
- avail = clamp_t(int, avail, slotsize, avail/3);
+ avail = clamp_t(int, avail, slotsize, total_avail/3);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 72a7681f4046..f2feb2d11bae 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
case 'Y':
case 'y':
case '1':
- if (nn->nfsd_serv)
+ if (!nn->nfsd_serv)
return -EBUSY;
nfsd4_end_grace(nn);
break;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f2129a5d9f23..4391fd3abd8f 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -189,7 +189,7 @@ retry:
*/
if (!err)
return 0;
- else if (err != -EEXIST)
+ else if (err != -EBUSY)
goto failed_unlock;
err = invalidate_inode_pages2_range(btnc, newkey, newkey);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 56992b32c6bb..a90bb19dcfa2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
+ unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh;
size_t fh_len = event->fh_len;
size_t len = fanotify_event_info_len(event);
@@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
buf += sizeof(handle);
len -= sizeof(handle);
- if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ /*
+ * For an inline fh, copy through stack to exclude the copy from
+ * usercopy hardening protections.
+ */
+ fh = fanotify_event_fh(event);
+ if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
+ memcpy(bounce, fh, fh_len);
+ fh = bounce;
+ }
+ if (copy_to_user(buf, fh, fh_len))
return -EFAULT;
/* Pad with 0's */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2901fbb9f76..7b53598c8804 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
- else if (create)
- return -EEXIST;
+ else if (create) {
+ ret = -EEXIST;
+ goto out;
+ }
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
@@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* return the wd */
ret = i_mark->wd;
+out:
/* match the get from fsnotify_find_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index f038235c64bd..c3334eca18c7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
/* override block size reported to stat */
- if (request_mask & STATX_SIZE)
- stat->result_mask = STATX_BASIC_STATS;
- else
- stat->result_mask = STATX_BASIC_STATS &
- ~STATX_SIZE;
+ if (!(request_mask & STATX_SIZE))
+ stat->result_mask &= ~STATX_SIZE;
stat->attributes_mask = STATX_ATTR_IMMUTABLE |
STATX_ATTR_APPEND;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 9e62dcf06fc4..68b3303e4b46 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
int err;
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ if (S_ISREG(c->stat.mode) && !c->metacopy) {
+ struct path upperpath, datapath;
+
+ ovl_path_upper(c->dentry, &upperpath);
+ if (WARN_ON(upperpath.dentry != NULL))
+ return -EIO;
+ upperpath.dentry = temp;
+
+ ovl_path_lowerdata(c->dentry, &datapath);
+ err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+ if (err)
+ return err;
+ }
+
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
if (err)
return err;
@@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
- if (S_ISREG(c->stat.mode) && !c->metacopy) {
- struct path upperpath, datapath;
-
- ovl_path_upper(c->dentry, &upperpath);
- BUG_ON(upperpath.dentry != NULL);
- upperpath.dentry = temp;
-
- ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
- if (err)
- return err;
- }
-
if (c->metacopy) {
err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
NULL, 0, -EOPNOTSUPP);
@@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
struct path upperpath, datapath;
int err;
+ char *capability = NULL;
+ ssize_t uninitialized_var(cap_size);
ovl_path_upper(c->dentry, &upperpath);
if (WARN_ON(upperpath.dentry == NULL))
@@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (WARN_ON(datapath.dentry == NULL))
return -EIO;
+ if (c->stat.size) {
+ err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ &capability, 0);
+ if (err < 0 && err != -ENODATA)
+ goto out;
+ }
+
err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
if (err)
- return err;
+ goto out_free;
+
+ /*
+ * Writing to upper file will clear security.capability xattr. We
+ * don't want that to happen for normal copy-up operation.
+ */
+ if (capability) {
+ err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
+ if (err)
+ goto out_free;
+ }
+
err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
- return err;
+ goto out_free;
ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+ kfree(capability);
+out:
return err;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5e45cb3630a0..9c6018287d57 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
int ovl_check_metacopy_xattr(struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding);
static inline bool ovl_is_impuredir(struct dentry *dentry)
{
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7c01327b1852..4035e640f402 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+ size_t padding)
{
- int res;
- char *s, *next, *buf = NULL;
+ ssize_t res;
+ char *buf = NULL;
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ res = vfs_getxattr(dentry, name, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
- return NULL;
+ return -ENODATA;
goto fail;
}
- buf = kzalloc(res + padding + 1, GFP_KERNEL);
- if (!buf)
- return ERR_PTR(-ENOMEM);
+ if (res != 0) {
+ buf = kzalloc(res + padding, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- if (res == 0)
- goto invalid;
+ res = vfs_getxattr(dentry, name, buf, res);
+ if (res < 0)
+ goto fail;
+ }
+ *value = buf;
+
+ return res;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+ name, res);
+ kfree(buf);
+ return res;
+}
- res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+{
+ int res;
+ char *s, *next, *buf = NULL;
+
+ res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
+ if (res == -ENODATA)
+ return NULL;
if (res < 0)
- goto fail;
+ return ERR_PTR(res);
if (res == 0)
goto invalid;
@@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
}
return buf;
-
-err_free:
- kfree(buf);
- return ERR_PTR(res);
-fail:
- pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
- goto err_free;
invalid:
pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
res = -EINVAL;
- goto err_free;
+ kfree(buf);
+ return ERR_PTR(res);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 51d5fd8840ab..070aad543382 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -225,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
}
EXPORT_SYMBOL(generic_pipe_buf_release);
+/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
- .can_merge = 1,
+ .confirm = generic_pipe_buf_confirm,
+ .release = anon_pipe_buf_release,
+ .steal = anon_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
@@ -234,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
};
static const struct pipe_buf_operations packet_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
+/**
+ * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
+ * @buf: the buffer to mark
+ *
+ * Description:
+ * This function ensures that no future writes will be merged into the
+ * given &struct pipe_buffer. This is necessary when multiple pipe buffers
+ * share the same backing page.
+ */
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+ if (buf->ops == &anon_pipe_buf_ops)
+ buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
+static bool pipe_buf_can_merge(struct pipe_buffer *buf)
+{
+ return buf->ops == &anon_pipe_buf_ops;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -378,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
struct pipe_buffer *buf = pipe->bufs + lastbuf;
int offset = buf->offset + buf->len;
- if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) {
+ if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
diff --git a/fs/pnode.c b/fs/pnode.c
index 1100e810d855..7ea6cfb65077 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
}
/* all accesses are serialized by namespace_sem */
-static struct user_namespace *user_ns;
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
@@ -260,9 +259,6 @@ static int propagate_one(struct mount *m)
type |= CL_MAKE_SHARED;
}
- /* Notice when we are propagating across user namespaces */
- if (m->mnt_ns->user_ns != user_ns)
- type |= CL_UNPRIVILEGED;
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
@@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
* propagate_one(); everything is serialized by namespace_sem,
* so globals will do just fine.
*/
- user_ns = current->nsproxy->mnt_ns->user_ns;
last_dest = dest_mnt;
first_source = source_mnt;
last_source = source_mnt;
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65becd2..3960a83666cf 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -27,8 +27,7 @@
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
#define CL_SHARED_TO_SLAVE 0x20
-#define CL_UNPRIVILEGED 0x40
-#define CL_COPY_MNT_NS_FILE 0x80
+#define CL_COPY_MNT_NS_FILE 0x40
#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5ab1849971b4..ddef482f1334 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
@@ -92,7 +93,6 @@
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
-#include <linux/flex_array.h>
#include <linux/posix-timers.h>
#include <trace/events/oom.h>
#include "internal.h"
@@ -2142,11 +2142,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
+ GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2178,35 +2179,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > ctx->pos)
- nr_files++;
- }
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= ctx->pos)
- continue;
- info.start = vma->vm_start;
- info.end = vma->vm_end;
- info.mode = vma->vm_file->f_mode;
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2215,7 +2203,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
- p = flex_array_get(fa, i);
+ p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
buf, len,
@@ -2225,12 +2213,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
break;
ctx->pos++;
}
- if (fa)
- flex_array_free(fa);
out_put_task:
put_task_struct(task);
out:
+ genradix_free(&fa);
return ret;
}
@@ -2459,11 +2446,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
static struct dentry *proc_pident_lookup(struct inode *dir,
struct dentry *dentry,
- const struct pid_entry *ents,
- unsigned int nents)
+ const struct pid_entry *p,
+ const struct pid_entry *end)
{
struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
@@ -2473,8 +2459,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents];
- for (p = ents; p < last; p++) {
+ for (; p < end; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len)) {
@@ -2610,7 +2595,7 @@ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
{ \
return proc_pident_lookup(dir, dentry, \
LSM##_attr_dir_stuff, \
- ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
@@ -2655,7 +2640,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+ attr_dir_stuff,
+ attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
}
static const struct inode_operations proc_attr_dir_inode_operations = {
@@ -3088,10 +3074,20 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek,
};
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+ if (!d_is_dir(file->f_path.dentry) ||
+ (file->f_op != &proc_tgid_base_operations))
+ return ERR_PTR(-EBADF);
+
+ return proc_pid(file_inode(file));
+}
+
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+ tgid_base_stuff,
+ tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
}
static const struct inode_operations proc_tgid_base_inode_operations = {
@@ -3463,7 +3459,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ tid_base_stuff,
+ tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
}
static const struct file_operations proc_tid_base_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index da649ccd6804..fc7e38def174 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,7 +24,6 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/magic.h>
#include <linux/uaccess.h>
@@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-static const struct super_operations proc_sops = {
+const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
- .remount_fs = proc_remount,
.show_options = proc_show_options,
};
@@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
pde_put(de);
return inode;
}
-
-int proc_fill_super(struct super_block *s, void *data, int silent)
-{
- struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
- struct inode *root_inode;
- int ret;
-
- if (!proc_parse_options(data, ns))
- return -EINVAL;
-
- /* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
- s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = PROC_SUPER_MAGIC;
- s->s_op = &proc_sops;
- s->s_time_gran = 1;
-
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- /* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
-
- pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode) {
- pr_err("proc_fill_super: get root inode failed\n");
- return -ENOMEM;
- }
-
- s->s_root = d_make_root(root_inode);
- if (!s->s_root) {
- pr_err("proc_fill_super: allocate dentry failed\n");
- return -ENOMEM;
- }
-
- ret = proc_setup_self(s);
- if (ret) {
- return ret;
- }
- return proc_setup_thread_self(s);
-}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ea575375f210..d1671e97f7fe 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -207,13 +207,12 @@ struct pde_opener {
struct completion *c;
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
-
extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -271,10 +270,8 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
-extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 621e6ec322ca..8b145e7b9661 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -19,86 +19,178 @@
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
#include "internal.h"
-enum {
- Opt_gid, Opt_hidepid, Opt_err,
+struct proc_fs_context {
+ struct pid_namespace *pid_ns;
+ unsigned int mask;
+ int hidepid;
+ int gid;
};
-static const match_table_t tokens = {
- {Opt_hidepid, "hidepid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_err, NULL},
+enum proc_param {
+ Opt_gid,
+ Opt_hidepid,
};
-int proc_parse_options(char *options, struct pid_namespace *pid)
+static const struct fs_parameter_spec proc_param_specs[] = {
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32("hidepid", Opt_hidepid),
+ {}
+};
+
+static const struct fs_parameter_description proc_fs_parameters = {
+ .name = "proc",
+ .specs = proc_param_specs,
+};
+
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- pid->pid_gid = make_kgid(current_user_ns(), option);
- break;
- case Opt_hidepid:
- if (match_int(&args[0], &option))
- return 0;
- if (option < HIDEPID_OFF ||
- option > HIDEPID_INVISIBLE) {
- pr_err("proc: hidepid value must be between 0 and 2.\n");
- return 0;
- }
- pid->hide_pid = option;
- break;
- default:
- pr_err("proc: unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_gid:
+ ctx->gid = result.uint_32;
+ break;
+
+ case Opt_hidepid:
+ ctx->hidepid = result.uint_32;
+ if (ctx->hidepid < HIDEPID_OFF ||
+ ctx->hidepid > HIDEPID_INVISIBLE)
+ return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+ break;
+
+ default:
+ return -EINVAL;
}
- return 1;
+ ctx->mask |= 1 << opt;
+ return 0;
}
-int proc_remount(struct super_block *sb, int *flags, char *data)
+static void proc_apply_options(struct super_block *s,
+ struct fs_context *fc,
+ struct pid_namespace *pid_ns,
+ struct user_namespace *user_ns)
{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->mask & (1 << Opt_gid))
+ pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+ if (ctx->mask & (1 << Opt_hidepid))
+ pid_ns->hide_pid = ctx->hidepid;
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+ struct inode *root_inode;
+ int ret;
+
+ proc_apply_options(s, fc, pid_ns, current_user_ns());
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = PROC_SUPER_MAGIC;
+ s->s_op = &proc_sops;
+ s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
+ pde_get(&proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
+
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
+
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
struct pid_namespace *pid = sb->s_fs_info;
sync_filesystem(sb);
- return !proc_parse_options(data, pid);
+
+ proc_apply_options(sb, fc, pid, current_user_ns());
+ return 0;
}
-static struct dentry *proc_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int proc_get_tree(struct fs_context *fc)
{
- struct pid_namespace *ns;
+ struct proc_fs_context *ctx = fc->fs_private;
- if (flags & SB_KERNMOUNT) {
- ns = data;
- data = NULL;
- } else {
- ns = task_active_pid_ns(current);
- }
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ fc->s_fs_info = ctx->pid_ns;
+ return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super);
+}
- return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+static void proc_fs_context_free(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->pid_ns)
+ put_pid_ns(ctx->pid_ns);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+ .free = proc_fs_context_free,
+ .parse_param = proc_parse_param,
+ .get_tree = proc_get_tree,
+ .reconfigure = proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->fs_private = ctx;
+ fc->ops = &proc_fs_context_ops;
+ return 0;
}
static void proc_kill_sb(struct super_block *sb)
@@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb)
}
static struct file_system_type proc_fs_type = {
- .name = "proc",
- .mount = proc_mount,
- .kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "proc",
+ .init_fs_context = proc_init_fs_context,
+ .parameters = &proc_fs_parameters,
+ .kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -156,7 +249,7 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
{
if (!proc_pid_lookup(dentry, flags))
return NULL;
-
+
return proc_lookup(dir, dentry, flags);
}
@@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = {
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
+ struct proc_fs_context *ctx;
+ struct fs_context *fc;
struct vfsmount *mnt;
- mnt = kern_mount_data(&proc_fs_type, ns);
+ fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ if (fc->user_ns != ns->user_ns) {
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ns->user_ns);
+ }
+
+ ctx = fc->fs_private;
+ if (ctx->pid_ns != ns) {
+ put_pid_ns(ctx->pid_ns);
+ get_pid_ns(ns);
+ ctx->pid_ns = ns;
+ }
+
+ mnt = fc_mount(fc);
+ put_fs_context(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
diff --git a/fs/read_write.c b/fs/read_write.c
index 30df848b7451..177ccc3d405a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
return ret;
}
-ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
- loff_t *pos)
+static ssize_t __vfs_write(struct file *file, const char __user *p,
+ size_t count, loff_t *pos)
{
if (file->f_op->write)
return file->f_op->write(file, p, count, pos);
@@ -1238,6 +1238,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_readv(fd, vec, vlen, flags);
+
return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
#endif
@@ -1344,6 +1347,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
+ if (pos == -1)
+ return do_compat_writev(fd, vec, vlen, flags);
+
return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
#endif
diff --git a/fs/splice.c b/fs/splice.c
index 6489fb9436e4..3ee7e82df48f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -138,7 +138,6 @@ error:
}
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
- .can_merge = 0,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
@@ -326,7 +324,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
EXPORT_SYMBOL(generic_file_splice_read);
const struct pipe_buf_operations default_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -341,7 +338,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_nosteal,
@@ -1606,6 +1602,8 @@ retry:
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
obuf->len = len;
opipe->nrbufs++;
ibuf->offset += obuf->len;
@@ -1680,6 +1678,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ pipe_buf_mark_unmergeable(obuf);
+
if (obuf->len > len)
obuf->len = len;
diff --git a/fs/stat.c b/fs/stat.c
index adbfcd86c81b..c38e4c2e1221 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
stat->ctime = inode->i_ctime;
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
-
- if (IS_NOATIME(inode))
- stat->result_mask &= ~STATX_ATIME;
- if (IS_AUTOMOUNT(inode))
- stat->attributes |= STATX_ATTR_AUTOMOUNT;
}
EXPORT_SYMBOL(generic_fillattr);
@@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->result_mask |= STATX_BASIC_STATS;
request_mask &= STATX_ALL;
query_flags &= KSTAT_QUERY_FLAGS;
+
+ /* allow the fs to override these if it really wants to */
+ if (IS_NOATIME(inode))
+ stat->result_mask &= ~STATX_ATIME;
+ if (IS_AUTOMOUNT(inode))
+ stat->attributes |= STATX_ATTR_AUTOMOUNT;
+
if (inode->i_op->getattr)
return inode->i_op->getattr(path, stat, request_mask,
query_flags);
diff --git a/fs/super.c b/fs/super.c
index 48e25eba8465..583a0124bc39 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"
@@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb)
EXPORT_SYMBOL(generic_shutdown_super);
/**
+ * sget_fc - Find or create a superblock
+ * @fc: Filesystem context.
+ * @test: Comparison callback
+ * @set: Setup callback
+ *
+ * Find or create a superblock using the parameters stored in the filesystem
+ * context and the two callback functions.
+ *
+ * If an extant superblock is matched, then that will be returned with an
+ * elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info and s_id will be set and
+ * the set() callback will be invoked), the superblock will be published and it
+ * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
+ * as yet unset.
+ */
+struct super_block *sget_fc(struct fs_context *fc,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*set)(struct super_block *, struct fs_context *))
+{
+ struct super_block *s = NULL;
+ struct super_block *old;
+ struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
+ int err;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT) &&
+ fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) {
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ } else {
+ if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+ }
+
+retry:
+ spin_lock(&sb_lock);
+ if (test) {
+ hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
+ if (test(old, fc))
+ goto share_extant_sb;
+ }
+ }
+ if (!s) {
+ spin_unlock(&sb_lock);
+ s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+ goto retry;
+ }
+
+ s->s_fs_info = fc->s_fs_info;
+ err = set(s, fc);
+ if (err) {
+ s->s_fs_info = NULL;
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(err);
+ }
+ fc->s_fs_info = NULL;
+ s->s_type = fc->fs_type;
+ strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ list_add_tail(&s->s_list, &super_blocks);
+ hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+ spin_unlock(&sb_lock);
+ get_filesystem(s->s_type);
+ register_shrinker_prepared(&s->s_shrink);
+ return s;
+
+share_extant_sb:
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ destroy_unused_super(s);
+ return ERR_PTR(-EBUSY);
+ }
+ if (!grab_super(old))
+ goto retry;
+ destroy_unused_super(s);
+ return old;
+}
+EXPORT_SYMBOL(sget_fc);
+
+/**
* sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
@@ -835,28 +924,35 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
- * @sb: superblock in question
- * @sb_flags: revised superblock flags
- * @data: the rest of options
- * @force: whether or not to force the change
+ * reconfigure_super - asks filesystem to change superblock parameters
+ * @fc: The superblock and configuration
*
- * Alters the mount options of a mounted file system.
+ * Alters the configuration parameters of a live superblock.
*/
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int reconfigure_super(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
int retval;
- int remount_ro;
+ bool remount_ro = false;
+ bool force = fc->sb_flags & SB_FORCE;
+ if (fc->sb_flags_mask & ~MS_RMT_MASK)
+ return -EINVAL;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
+ retval = security_sb_remount(sb, fc->security);
+ if (retval)
+ return retval;
+
+ if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
- if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
- return -EACCES;
+ if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+ return -EACCES;
#endif
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ }
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
@@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
- remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb);
+ remount_ro = !sb_rdonly(sb);
}
}
shrink_dcache_sb(sb);
- /* If we are remounting RDONLY and current sb is read/write,
- make sure there are no rw files opened */
+ /* If we are reconfiguring to RDONLY and current sb is read/write,
+ * make sure there are no files open for writing.
+ */
if (remount_ro) {
if (force) {
sb->s_readonly_remount = 1;
@@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
- retval = sb->s_op->remount_fs(sb, &sb_flags, data);
+ if (fc->ops->reconfigure) {
+ retval = fc->ops->reconfigure(fc);
if (retval) {
if (!force)
goto cancel_readonly;
@@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
sb->s_type->name, retval);
}
}
- sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK);
+
+ WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
+ (fc->sb_flags & fc->sb_flags_mask)));
/* Needs to be ordered wrt mnt_is_readonly() */
smp_wmb();
sb->s_readonly_remount = 0;
@@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb)
down_write(&sb->s_umount);
if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
!sb_rdonly(sb)) {
- /*
- * What lock protects sb->s_flags??
- */
- do_remount_sb(sb, SB_RDONLY, NULL, 1);
+ struct fs_context *fc;
+
+ fc = fs_context_for_reconfigure(sb->s_root,
+ SB_RDONLY | SB_FORCE, SB_RDONLY);
+ if (!IS_ERR(fc)) {
+ if (parse_monolithic_mount_data(fc, NULL) == 0)
+ (void)reconfigure_super(fc);
+ put_fs_context(fc);
+ }
}
up_write(&sb->s_umount);
}
@@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_ns);
+int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
+{
+ return set_anon_super(sb, NULL);
+}
+EXPORT_SYMBOL(set_anon_super_fc);
+
+static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
+{
+ return sb->s_fs_info == fc->s_fs_info;
+}
+
+static int test_single_super(struct super_block *s, struct fs_context *fc)
+{
+ return 1;
+}
+
+/**
+ * vfs_get_super - Get a superblock with a search key set in s_fs_info.
+ * @fc: The filesystem context holding the parameters
+ * @keying: How to distinguish superblocks
+ * @fill_super: Helper to initialise a new superblock
+ *
+ * Search for a superblock and create a new one if not found. The search
+ * criterion is controlled by @keying. If the search fails, a new superblock
+ * is created and @fill_super() is called to initialise it.
+ *
+ * @keying can take one of a number of values:
+ *
+ * (1) vfs_get_single_super - Only one superblock of this type may exist on the
+ * system. This is typically used for special system filesystems.
+ *
+ * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
+ * distinct keys (where the key is in s_fs_info). Searching for the same
+ * key again will turn up the superblock for that key.
+ *
+ * (3) vfs_get_independent_super - Multiple superblocks may exist and are
+ * unkeyed. Each call will get a new superblock.
+ *
+ * A permissions check is made by sget_fc() unless we're getting a superblock
+ * for a kernel-internal mount or a submount.
+ */
+int vfs_get_super(struct fs_context *fc,
+ enum vfs_get_super_keying keying,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
+{
+ int (*test)(struct super_block *, struct fs_context *);
+ struct super_block *sb;
+
+ switch (keying) {
+ case vfs_get_single_super:
+ test = test_single_super;
+ break;
+ case vfs_get_keyed_super:
+ test = test_keyed_super;
+ break;
+ case vfs_get_independent_super:
+ test = NULL;
+ break;
+ default:
+ BUG();
+ }
+
+ sb = sget_fc(fc, test, set_anon_super_fc);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ int err = fill_super(sb, fc);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ BUG_ON(fc->root);
+ fc->root = dget(sb->s_root);
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_super);
+
#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
@@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
+static int reconfigure_single(struct super_block *s,
+ int flags, void *data)
+{
+ struct fs_context *fc;
+ int ret;
+
+ /* The caller really need to be passing fc down into mount_single(),
+ * then a chunk of this can be removed. [Bollocks -- AV]
+ * Better yet, reconfiguration shouldn't happen, but rather the second
+ * mount should be rejected if the parameters are not compatible.
+ */
+ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
+ if (IS_ERR(fc))
+ return PTR_ERR(fc);
+
+ ret = parse_monolithic_mount_data(fc, data);
+ if (ret < 0)
+ goto out;
+
+ ret = reconfigure_super(fc);
+out:
+ put_fs_context(fc);
+ return ret;
+}
+
static int compare_single(struct super_block *s, void *p)
{
return 1;
@@ -1229,41 +1441,64 @@ struct dentry *mount_single(struct file_system_type *fs_type,
return ERR_CAST(s);
if (!s->s_root) {
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
+ if (!error)
+ s->s_flags |= SB_ACTIVE;
} else {
- do_remount_sb(s, flags, data, 0);
+ error = reconfigure_single(s, flags, data);
+ }
+ if (unlikely(error)) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
}
return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);
-struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+/**
+ * vfs_get_tree - Get the mountable root
+ * @fc: The superblock configuration context.
+ *
+ * The filesystem is invoked to get or create a superblock which can then later
+ * be used for mounting. The filesystem places a pointer to the root to be
+ * used for mounting in @fc->root.
+ */
+int vfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
struct super_block *sb;
- int error = -ENOMEM;
- void *sec_opts = NULL;
+ int error;
- if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
- error = security_sb_eat_lsm_opts(data, &sec_opts);
- if (error)
- return ERR_PTR(error);
+ if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) {
+ errorf(fc, "Filesystem requires source device");
+ return -ENOENT;
}
- root = type->mount(type, flags, name, data);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- goto out_free_secdata;
+ if (fc->root)
+ return -EBUSY;
+
+ /* Get the mountable root in fc->root, with a ref on the root and a ref
+ * on the superblock.
+ */
+ error = fc->ops->get_tree(fc);
+ if (error < 0)
+ return error;
+
+ if (!fc->root) {
+ pr_err("Filesystem %s get_tree() didn't set fc->root\n",
+ fc->fs_type->name);
+ /* We don't know what the locking state of the superblock is -
+ * if there is a superblock.
+ */
+ BUG();
}
- sb = root->d_sb;
- BUG_ON(!sb);
+
+ sb = fc->root->d_sb;
WARN_ON(!sb->s_bdi);
+ if (fc->subtype && !sb->s_subtype) {
+ sb->s_subtype = fc->subtype;
+ fc->subtype = NULL;
+ }
+
/*
* Write barrier is for super_cache_count(). We place it before setting
* SB_BORN as the data dependency between the two functions is the
@@ -1273,14 +1508,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
smp_wmb();
sb->s_flags |= SB_BORN;
- error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
- if (error)
- goto out_sb;
-
- if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) {
- error = security_sb_kern_mount(sb);
- if (error)
- goto out_sb;
+ error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
+ if (unlikely(error)) {
+ fc_drop_locked(fc);
+ return error;
}
/*
@@ -1290,18 +1521,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
* violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
- "negative value (%lld)\n", type->name, sb->s_maxbytes);
+ "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
- up_write(&sb->s_umount);
- security_free_mnt_opts(&sec_opts);
- return root;
-out_sb:
- dput(root);
- deactivate_locked_super(sb);
-out_free_secdata:
- security_free_mnt_opts(&sec_opts);
- return ERR_PTR(error);
+ return 0;
}
+EXPORT_SYMBOL(vfs_get_tree);
/*
* Setup private BDI for given superblock. It gets automatically cleaned up
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92682fcc41f6..1b56686ab178 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,34 +13,71 @@
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <net/net_namespace.h>
#include "sysfs.h"
static struct kernfs_root *sysfs_root;
struct kernfs_node *sysfs_root_kn;
-static struct dentry *sysfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int sysfs_get_tree(struct fs_context *fc)
{
- struct dentry *root;
- void *ns;
- bool new_sb = false;
+ struct kernfs_fs_context *kfc = fc->fs_private;
+ int ret;
- if (!(flags & SB_KERNMOUNT)) {
+ ret = kernfs_get_tree(fc);
+ if (ret)
+ return ret;
+
+ if (kfc->new_sb_created)
+ fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ return 0;
+}
+
+static void sysfs_fs_context_free(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ if (kfc->ns_tag)
+ kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag);
+ kernfs_free_fs_context(fc);
+ kfree(kfc);
+}
+
+static const struct fs_context_operations sysfs_fs_context_ops = {
+ .free = sysfs_fs_context_free,
+ .get_tree = sysfs_get_tree,
+};
+
+static int sysfs_init_fs_context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc;
+ struct net *netns;
+
+ if (!(fc->sb_flags & SB_KERNMOUNT)) {
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
- ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
- root = kernfs_mount_ns(fs_type, flags, sysfs_root,
- SYSFS_MAGIC, &new_sb, ns);
- if (!new_sb)
- kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
- else if (!IS_ERR(root))
- root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
+ kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL);
+ if (!kfc)
+ return -ENOMEM;
- return root;
+ kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+ kfc->root = sysfs_root;
+ kfc->magic = SYSFS_MAGIC;
+ fc->fs_private = kfc;
+ fc->ops = &sysfs_fs_context_ops;
+ if (netns) {
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(netns->user_ns);
+ }
+ fc->global = true;
+ return 0;
}
static void sysfs_kill_sb(struct super_block *sb)
@@ -52,10 +89,10 @@ static void sysfs_kill_sb(struct super_block *sb)
}
static struct file_system_type sysfs_fs_type = {
- .name = "sysfs",
- .mount = sysfs_mount,
- .kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "sysfs",
+ .init_fs_context = sysfs_init_fs_context,
+ .kill_sb = sysfs_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 0f9c362a3402..82e4e6a30b04 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -28,6 +28,11 @@
#include <linux/mount.h>
#include "ubifs.h"
+/* Need to be kept consistent with checked flags in ioctl2ubifs() */
+#define UBIFS_SUPPORTED_IOCTL_FLAGS \
+ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \
+ FS_IMMUTABLE_FL | FS_DIRSYNC_FL)
+
/**
* ubifs_set_inode_flags - set VFS inode flags.
* @inode: VFS inode to set flags for
@@ -169,6 +174,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS)
+ return -EOPNOTSUPP;
+
if (!S_ISDIR(inode->i_mode))
flags &= ~FS_DIRSYNC_FL;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ae796e10f68b..e7276932e433 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1242,8 +1242,10 @@ set_size:
truncate_setsize(inode, newsize);
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
- udf_truncate_extents(inode);
+ err = udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
+ if (err)
+ return err;
}
update_time:
inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index b647f0bd150c..63a47f1e1d52 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
* for making file shorter. For making file longer, udf_extend_file() has to
* be used.
*/
-void udf_truncate_extents(struct inode *inode)
+int udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
struct kernel_lb_addr eloc, neloc = {};
@@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode)
if (etype == -1) {
/* We should extend the file? */
WARN_ON(byte_offset);
- return;
+ return 0;
}
epos.offset -= adsize;
extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
@@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode)
epos.block = eloc;
epos.bh = udf_tread(sb,
udf_get_lb_pblock(sb, &eloc, 0));
+ /* Error reading indirect block? */
+ if (!epos.bh)
+ return -EIO;
if (elen)
indirect_ext_len =
(elen + sb->s_blocksize - 1) >>
@@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode)
iinfo->i_lenExtents = inode->i_size;
brelse(epos.bh);
+ return 0;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ee246769dee4..d89ef71887fc 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
extern void udf_discard_prealloc(struct inode *);
-extern void udf_truncate_extents(struct inode *);
+extern int udf_truncate_extents(struct inode *);
/* balloc.c */
extern void udf_free_blocks(struct super_block *, struct inode *,
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 9a3767818c50..9c2a0a13ed61 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry(
*/
int /* error */
xfs_dir2_leaf_addname(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
__be16 *bestsp; /* freespace table in leaf */
- int compact; /* need to compact leaves */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
+ __be16 *tagp; /* end of data entry */
struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data block entry */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
int error; /* error return value */
int grown; /* allocated new data block */
- int highstale; /* index of next stale leaf */
+ int highstale = 0; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- struct xfs_buf *lbp; /* leaf's buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
int lfloglow; /* low leaf logging index */
int lfloghigh; /* high leaf logging index */
- int lowstale; /* index of prev stale leaf */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ int lowstale = 0; /* index of prev stale leaf */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- __be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
- struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
- dp = args->dp;
- tp = args->trans;
-
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 3b03703c5c3d..16731d2d684b 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node(
static int /* error */
xfs_dir2_leafn_add(
struct xfs_buf *bp, /* leaf buffer */
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
int compact; /* compacting stale leaves */
- xfs_inode_t *dp; /* incore directory inode */
- int highstale; /* next stale entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int highstale = 0; /* next stale entry */
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
- int lowstale; /* previous stale entry */
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
+ int lowstale = 0; /* previous stale entry */
trace_xfs_dir2_leafn_add(args, index);
- dp = args->dp;
- leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);