summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h23
-rw-r--r--fs/9p/vfs_file.c6
-rw-r--r--fs/9p/vfs_inode.c23
-rw-r--r--fs/9p/vfs_inode_dotl.c27
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/afs/callback.c3
-rw-r--r--fs/afs/cmservice.c2
-rw-r--r--fs/afs/fsclient.c6
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rxrpc.c30
-rw-r--r--fs/afs/server.c1
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/afs/yfsclient.c2
-rw-r--r--fs/aio.c338
-rw-r--r--fs/block_dev.c20
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/props.c8
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/transaction.c49
-rw-r--r--fs/btrfs/tree-log.c33
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_ioctl.h3
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h26
-rw-r--r--fs/cifs/connect.c34
-rw-r--r--fs/cifs/dir.c107
-rw-r--r--fs/cifs/file.c192
-rw-r--r--fs/cifs/misc.c25
-rw-r--r--fs/cifs/smb1ops.c126
-rw-r--r--fs/cifs/smb2file.c6
-rw-r--r--fs/cifs/smb2inode.c87
-rw-r--r--fs/cifs/smb2maperror.c3
-rw-r--r--fs/cifs/smb2misc.c6
-rw-r--r--fs/cifs/smb2ops.c459
-rw-r--r--fs/cifs/smb2pdu.c208
-rw-r--r--fs/cifs/smb2pdu.h7
-rw-r--r--fs/cifs/smb2proto.h10
-rw-r--r--fs/cifs/smb2status.h6
-rw-r--r--fs/cifs/trace.h130
-rw-r--r--fs/cifs/transport.c226
-rw-r--r--fs/dax.c15
-rw-r--r--fs/debugfs/inode.c13
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/indirect.c43
-rw-r--r--fs/ext4/inode.c30
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/resize.c17
-rw-r--r--fs/ext4/super.c16
-rw-r--r--fs/f2fs/checkpoint.c20
-rw-r--r--fs/f2fs/data.c59
-rw-r--r--fs/f2fs/debug.c19
-rw-r--r--fs/f2fs/dir.c15
-rw-r--r--fs/f2fs/extent_cache.c2
-rw-r--r--fs/f2fs/f2fs.h77
-rw-r--r--fs/f2fs/file.c46
-rw-r--r--fs/f2fs/inline.c12
-rw-r--r--fs/f2fs/inode.c15
-rw-r--r--fs/f2fs/namei.c3
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/segment.c80
-rw-r--r--fs/f2fs/segment.h2
-rw-r--r--fs/f2fs/super.c109
-rw-r--r--fs/f2fs/sysfs.c17
-rw-r--r--fs/f2fs/trace.c20
-rw-r--r--fs/f2fs/xattr.c25
-rw-r--r--fs/f2fs/xattr.h6
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fuse/dev.c12
-rw-r--r--fs/hugetlbfs/inode.c20
-rw-r--r--fs/io_uring.c511
-rw-r--r--fs/iomap.c12
-rw-r--r--fs/jffs2/readinode.c5
-rw-r--r--fs/jffs2/super.c5
-rw-r--r--fs/lockd/host.c3
-rw-r--r--fs/locks.c5
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c5
-rw-r--r--fs/nfs/nfs42proc.c3
-rw-r--r--fs/nfs/nfs4file.c4
-rw-r--r--fs/nfs/nfs4proc.c5
-rw-r--r--fs/nfs/nfs4xdr.c4
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/super.c3
-rw-r--r--fs/nfsd/nfs3proc.c17
-rw-r--r--fs/nfsd/nfs3xdr.c11
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4state.c12
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c7
-rw-r--r--fs/ocfs2/refcounttree.c42
-rw-r--r--fs/open.c24
-rw-r--r--fs/pipe.c4
-rw-r--r--fs/proc/base.c26
-rw-r--r--fs/proc/kcore.c29
-rw-r--r--fs/proc/proc_sysctl.c3
-rw-r--r--fs/proc/task_mmu.c18
-rw-r--r--fs/read_write.c5
-rw-r--r--fs/splice.c12
-rw-r--r--fs/sysfs/mount.c8
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/udf/inode.c4
-rw-r--r--fs/udf/truncate.c8
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c15
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c18
-rw-r--r--fs/xfs/scrub/btree.c11
-rw-r--r--fs/xfs/scrub/dabtree.c5
-rw-r--r--fs/xfs/xfs_discard.c8
-rw-r--r--fs/xfs/xfs_file.c27
119 files changed, 2606 insertions, 1347 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6dec8d1..aaee1e6584e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,6 +40,9 @@
*/
#define P9_LOCK_TIMEOUT (30*HZ)
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
extern struct file_system_type v9fs_fs_type;
extern const struct address_space_operations v9fs_addr_operations;
extern const struct file_operations v9fs_file_operations;
@@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+ struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
}
int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ /*
+ * 32-bit need the lock, concurrent updates could break the
+ * sequences and make i_size_read() loop forever.
+ * 64-bit updates are atomic and can skip the locking.
+ */
+ if (sizeof(i_size) > sizeof(long))
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, i_size);
+ if (sizeof(i_size) > sizeof(long))
+ spin_unlock(&inode->i_lock);
+}
#endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a25efa782fcc..9a1125305d84 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
i_size = i_size_read(inode);
if (iocb->ki_pos > i_size) {
inode_add_bytes(inode, iocb->ki_pos - i_size);
- i_size_write(inode, iocb->ki_pos);
+ /*
+ * Need to serialize against i_size_write() in
+ * v9fs_stat2inode()
+ */
+ v9fs_i_size_write(inode, iocb->ki_pos);
}
return retval;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859d3af5..72b779bc0942 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode(st, inode, sb);
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+ v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
@@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
* @stat: Plan 9 metadata (mistat) structure
* @inode: inode to populate
* @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
- struct super_block *sb)
+ struct super_block *sb, unsigned int flags)
{
umode_t mode;
char ext[32];
@@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode = p9mode2perm(v9ses, stat);
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->length);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
- inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+ inode->i_blocks = (stat->length + 512 - 1) >> 9;
v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
}
@@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
{
int umode;
dev_t rdev;
- loff_t i_size;
struct p9_wstat *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_stat(fid);
@@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode(st, inode, inode->i_sb, flags);
out:
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c46999..a950a927a626 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
if (retval)
goto error;
- v9fs_stat2inode_dotl(st, inode);
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode_dotl(st, d_inode(dentry));
+ v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
generic_fillattr(d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
*
*/
void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+ unsigned int flags)
{
umode_t mode;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+ v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
}
if (stat->st_result_mask & P9_STATS_RDEV)
inode->i_rdev = new_decode_dev(stat->st_rdev);
- if (stat->st_result_mask & P9_STATS_SIZE)
- i_size_write(inode, stat->st_size);
+ if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+ stat->st_result_mask & P9_STATS_SIZE)
+ v9fs_i_size_write(inode, stat->st_size);
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
@@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
{
- loff_t i_size;
struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses;
+ unsigned int flags;
v9ses = v9fs_inode2v9ses(inode);
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
goto out;
- spin_lock(&inode->i_lock);
/*
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- i_size = inode->i_size;
- v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- inode->i_size = i_size;
- spin_unlock(&inode->i_lock);
+ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ V9FS_STAT2INODE_KEEP_ISIZE : 0;
+ v9fs_stat2inode_dotl(st, inode, flags);
out:
kfree(st);
return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 10d3bd3f534b..d13d35cf69c0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, d_inode(root));
+ v9fs_stat2inode_dotl(st, d_inode(root), 0);
kfree(st);
} else {
struct p9_wstat *st = NULL;
@@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, d_inode(root), sb);
+ v9fs_stat2inode(st, d_inode(root), sb, 0);
p9stat_free(st);
kfree(st);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1c7955f5cdaf..128f2dbe256a 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -203,8 +203,7 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
*/
void afs_init_callback_state(struct afs_server *server)
{
- if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags))
- server->cb_s_break++;
+ server->cb_s_break++;
}
/*
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 8ee5972893ed..2f8acb4c556d 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -34,7 +34,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
static int afs_deliver_yfs_cb_callback(struct afs_call *);
#define CM_NAME(name) \
- const char afs_SRXCB##name##_name[] __tracepoint_string = \
+ char afs_SRXCB##name##_name[] __tracepoint_string = \
"CB." #name
/*
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index ca08c83168f5..0b37867b5c20 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1515,8 +1515,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
- *bp++ = 0;
+ *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */
+ *bp++ = htonl((u32) attr->ia_size);
*bp++ = 0; /* size of write */
*bp++ = 0;
*bp++ = htonl(attr->ia_size >> 32); /* new file length */
@@ -1564,7 +1564,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
xdr_encode_AFS_StoreStatus(&bp, attr);
- *bp++ = 0; /* position of start of write */
+ *bp++ = htonl(attr->ia_size); /* position of start of write */
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1a4ce07fb406..9cedc3fc1b77 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -216,9 +216,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_ctime.tv_sec = get_seconds();
- inode->i_ctime.tv_nsec = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime;
+ inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
inode->i_blocks = 0;
inode_set_iversion_raw(inode, 0);
inode->i_generation = 0;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index bb1f244b2b3a..3904ab0b9563 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -474,7 +474,6 @@ struct afs_server {
time64_t put_time; /* Time at which last put */
time64_t update_at; /* Time at which to next update the record */
unsigned long flags;
-#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */
#define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */
#define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */
#define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */
@@ -827,7 +826,7 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
- return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+ return vnode->cb_break + vnode->cb_v_break;
}
static inline bool afs_cb_is_broken(unsigned int cb_break,
@@ -835,7 +834,6 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
const struct afs_cb_interest *cbi)
{
return !cbi || cb_break != (vnode->cb_break +
- cbi->server->cb_s_break +
vnode->volume->cb_v_break);
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 2c588f9bbbda..15c7e82d80cb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -572,13 +572,17 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
- default:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KUM");
goto local_abort;
+ default:
+ abort_code = RX_USER_ABORT;
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, ret, "KER");
+ goto local_abort;
}
}
@@ -610,6 +614,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
bool stalled = false;
u64 rtt;
u32 life, last_life;
+ bool rxrpc_complete = false;
DECLARE_WAITQUEUE(myself, current);
@@ -621,7 +626,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
rtt2 = 2;
timeout = rtt2;
- last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+ rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
add_wait_queue(&call->waitq, &myself);
for (;;) {
@@ -639,7 +644,12 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
if (afs_check_call_state(call, AFS_CALL_COMPLETE))
break;
- life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+ /* rxrpc terminated the call. */
+ rxrpc_complete = true;
+ break;
+ }
+
if (timeout == 0 &&
life == last_life && signal_pending(current)) {
if (stalled)
@@ -663,12 +673,16 @@ static long afs_wait_for_call_to_complete(struct afs_call *call,
remove_wait_queue(&call->waitq, &myself);
__set_current_state(TASK_RUNNING);
- /* Kill off the call if it's still live. */
if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
- _debug("call interrupted");
- if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- RX_USER_ABORT, -EINTR, "KWI"))
- afs_set_call_complete(call, -EINTR, 0);
+ if (rxrpc_complete) {
+ afs_set_call_complete(call, call->error, call->abort_code);
+ } else {
+ /* Kill off the call if it's still live. */
+ _debug("call interrupted");
+ if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ RX_USER_ABORT, -EINTR, "KWI"))
+ afs_set_call_complete(call, -EINTR, 0);
+ }
}
spin_lock_bh(&call->state_lock);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 642afa2e9783..65b33b6da48b 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -226,7 +226,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net,
RCU_INIT_POINTER(server->addresses, alist);
server->addr_version = alist->version;
server->uuid = *uuid;
- server->flags = (1UL << AFS_SERVER_FL_NEW);
server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
rwlock_init(&server->fs_lock);
INIT_HLIST_HEAD(&server->cb_volumes);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 72efcfcf9f95..0122d7445fba 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -264,6 +264,7 @@ static void afs_kill_pages(struct address_space *mapping,
first = page->index + 1;
lock_page(page);
generic_error_remove_page(mapping, page);
+ unlock_page(page);
}
__pagevec_release(&pv);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 5aa57929e8c2..6e97a42d24d1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1514,7 +1514,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vnode->fid);
bp = xdr_encode_YFS_StoreStatus(bp, attr);
- bp = xdr_encode_u64(bp, 0); /* position of start of write */
+ bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */
bp = xdr_encode_u64(bp, 0); /* size of write */
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
diff --git a/fs/aio.c b/fs/aio.c
index 38b741aef0bf..3490d1fa0e16 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,7 +181,7 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool woken;
+ bool done;
bool cancelled;
struct wait_queue_entry wait;
struct work_struct work;
@@ -204,8 +204,7 @@ struct aio_kiocb {
struct kioctx *ki_ctx;
kiocb_cancel_fn *ki_cancel;
- struct iocb __user *ki_user_iocb; /* user's aiocb */
- __u64 ki_user_data; /* user's data for completion */
+ struct io_event ki_res;
struct list_head ki_list; /* the aio core uses this
* for cancellation */
@@ -1022,6 +1021,9 @@ static bool get_reqs_available(struct kioctx *ctx)
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
+ *
+ * The refcount is initialized to 2 - one for the async op completion,
+ * one for the synchronous code that does this.
*/
static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
@@ -1031,10 +1033,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
if (unlikely(!req))
return NULL;
+ if (unlikely(!get_reqs_available(ctx))) {
+ kmem_cache_free(kiocb_cachep, req);
+ return NULL;
+ }
+
percpu_ref_get(&ctx->reqs);
req->ki_ctx = ctx;
INIT_LIST_HEAD(&req->ki_list);
- refcount_set(&req->ki_refcnt, 0);
+ refcount_set(&req->ki_refcnt, 2);
req->ki_eventfd = NULL;
return req;
}
@@ -1067,30 +1074,20 @@ out:
return ret;
}
-static inline void iocb_put(struct aio_kiocb *iocb)
-{
- if (refcount_read(&iocb->ki_refcnt) == 0 ||
- refcount_dec_and_test(&iocb->ki_refcnt)) {
- if (iocb->ki_filp)
- fput(iocb->ki_filp);
- percpu_ref_put(&iocb->ki_ctx->reqs);
- kmem_cache_free(kiocb_cachep, iocb);
- }
-}
-
-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
- long res, long res2)
+static inline void iocb_destroy(struct aio_kiocb *iocb)
{
- ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
- ev->data = iocb->ki_user_data;
- ev->res = res;
- ev->res2 = res2;
+ if (iocb->ki_eventfd)
+ eventfd_ctx_put(iocb->ki_eventfd);
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
+ percpu_ref_put(&iocb->ki_ctx->reqs);
+ kmem_cache_free(kiocb_cachep, iocb);
}
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
@@ -1114,14 +1111,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- aio_fill_event(event, iocb, res, res2);
+ *event = iocb->ki_res;
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
- pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
- res, res2);
+ pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
+ (void __user *)(unsigned long)iocb->ki_res.obj,
+ iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
/* after flagging the request as done, we
* must never even look at it again
@@ -1148,10 +1145,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd) {
+ if (iocb->ki_eventfd)
eventfd_signal(iocb->ki_eventfd, 1);
- eventfd_ctx_put(iocb->ki_eventfd);
- }
/*
* We have to order our ring_info tail store above and test
@@ -1163,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- iocb_put(iocb);
+}
+
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+ if (refcount_dec_and_test(&iocb->ki_refcnt)) {
+ aio_complete(iocb);
+ iocb_destroy(iocb);
+ }
}
/* aio_read_events_ring
@@ -1437,7 +1439,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- aio_complete(iocb, res, res2);
+ iocb->ki_res.res = res;
+ iocb->ki_res.res2 = res2;
+ iocb_put(iocb);
}
static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
@@ -1514,13 +1518,13 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
}
}
-static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+static int aio_read(struct kiocb *req, const struct iocb *iocb,
bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
struct file *file;
- ssize_t ret;
+ int ret;
ret = aio_prep_rw(req, iocb);
if (ret)
@@ -1542,13 +1546,13 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
return ret;
}
-static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+static int aio_write(struct kiocb *req, const struct iocb *iocb,
bool vectored, bool compat)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
struct file *file;
- ssize_t ret;
+ int ret;
ret = aio_prep_rw(req, iocb);
if (ret)
@@ -1585,11 +1589,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
- struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
- int ret;
+ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
- ret = vfs_fsync(req->file, req->datasync);
- aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+ iocb_put(iocb);
}
static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
@@ -1608,11 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
return 0;
}
-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
- aio_complete(iocb, mangle_poll(mask), 0);
-}
-
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work)
return;
}
list_del_init(&iocb->ki_list);
+ iocb->ki_res.res = mangle_poll(mask);
+ req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
- aio_poll_complete(iocb, mask);
+ iocb_put(iocb);
}
/* assumes we are called with irqs disabled */
@@ -1668,31 +1668,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
__poll_t mask = key_to_poll(key);
unsigned long flags;
- req->woken = true;
-
/* for instances that support it check for an event match first: */
- if (mask) {
- if (!(mask & req->events))
- return 0;
+ if (mask && !(mask & req->events))
+ return 0;
+
+ list_del_init(&req->wait.entry);
+ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
/*
* Try to complete the iocb inline if we can. Use
* irqsave/irqrestore because not all filesystems (e.g. fuse)
* call this function with IRQs disabled and because IRQs
* have to be disabled before ctx_lock is obtained.
*/
- if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
- list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
-
- list_del_init(&req->wait.entry);
- aio_poll_complete(iocb, mask);
- return 1;
- }
+ list_del(&iocb->ki_list);
+ iocb->ki_res.res = mangle_poll(mask);
+ req->done = true;
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+ iocb_put(iocb);
+ } else {
+ schedule_work(&req->work);
}
-
- list_del_init(&req->wait.entry);
- schedule_work(&req->work);
return 1;
}
@@ -1719,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
add_wait_queue(head, &pt->iocb->poll.wait);
}
-static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
+static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
{
struct kioctx *ctx = aiocb->ki_ctx;
struct poll_iocb *req = &aiocb->poll;
struct aio_poll_table apt;
+ bool cancel = false;
__poll_t mask;
/* reject any unknown events outside the normal event mask. */
@@ -1737,7 +1734,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->woken = false;
+ req->done = false;
req->cancelled = false;
apt.pt._qproc = aio_poll_queue_proc;
@@ -1749,156 +1746,135 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_LIST_HEAD(&req->wait.entry);
init_waitqueue_func_entry(&req->wait, aio_poll_wake);
- /* one for removal from waitqueue, one for this function */
- refcount_set(&aiocb->ki_refcnt, 2);
-
mask = vfs_poll(req->file, &apt.pt) & req->events;
- if (unlikely(!req->head)) {
- /* we did not manage to set up a waitqueue, done */
- goto out;
- }
-
spin_lock_irq(&ctx->ctx_lock);
- spin_lock(&req->head->lock);
- if (req->woken) {
- /* wake_up context handles the rest */
- mask = 0;
+ if (likely(req->head)) {
+ spin_lock(&req->head->lock);
+ if (unlikely(list_empty(&req->wait.entry))) {
+ if (apt.error)
+ cancel = true;
+ apt.error = 0;
+ mask = 0;
+ }
+ if (mask || apt.error) {
+ list_del_init(&req->wait.entry);
+ } else if (cancel) {
+ WRITE_ONCE(req->cancelled, true);
+ } else if (!req->done) { /* actually waiting for an event */
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ aiocb->ki_cancel = aio_poll_cancel;
+ }
+ spin_unlock(&req->head->lock);
+ }
+ if (mask) { /* no async, we'd stolen it */
+ aiocb->ki_res.res = mangle_poll(mask);
apt.error = 0;
- } else if (mask || apt.error) {
- /* if we get an error or a mask we are done */
- WARN_ON_ONCE(list_empty(&req->wait.entry));
- list_del_init(&req->wait.entry);
- } else {
- /* actually waiting for an event */
- list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
- aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
spin_unlock_irq(&ctx->ctx_lock);
-
-out:
- if (unlikely(apt.error))
- return apt.error;
-
if (mask)
- aio_poll_complete(aiocb, mask);
- iocb_put(aiocb);
- return 0;
+ iocb_put(aiocb);
+ return apt.error;
}
static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
- struct iocb __user *user_iocb, bool compat)
+ struct iocb __user *user_iocb, struct aio_kiocb *req,
+ bool compat)
{
- struct aio_kiocb *req;
- ssize_t ret;
-
- /* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved2)) {
- pr_debug("EINVAL: reserve field set\n");
- return -EINVAL;
- }
-
- /* prevent overflows */
- if (unlikely(
- (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
- (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
- ((ssize_t)iocb->aio_nbytes < 0)
- )) {
- pr_debug("EINVAL: overflow check\n");
- return -EINVAL;
- }
-
- if (!get_reqs_available(ctx))
- return -EAGAIN;
-
- ret = -EAGAIN;
- req = aio_get_req(ctx);
- if (unlikely(!req))
- goto out_put_reqs_available;
-
req->ki_filp = fget(iocb->aio_fildes);
- ret = -EBADF;
if (unlikely(!req->ki_filp))
- goto out_put_req;
+ return -EBADF;
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ struct eventfd_ctx *eventfd;
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
- if (IS_ERR(req->ki_eventfd)) {
- ret = PTR_ERR(req->ki_eventfd);
- req->ki_eventfd = NULL;
- goto out_put_req;
- }
+ eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ req->ki_eventfd = eventfd;
}
- ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
- if (unlikely(ret)) {
+ if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
pr_debug("EFAULT: aio_key\n");
- goto out_put_req;
+ return -EFAULT;
}
- req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb->aio_data;
+ req->ki_res.obj = (u64)(unsigned long)user_iocb;
+ req->ki_res.data = iocb->aio_data;
+ req->ki_res.res = 0;
+ req->ki_res.res2 = 0;
switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->rw, iocb, false, compat);
- break;
+ return aio_read(&req->rw, iocb, false, compat);
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->rw, iocb, false, compat);
- break;
+ return aio_write(&req->rw, iocb, false, compat);
case IOCB_CMD_PREADV:
- ret = aio_read(&req->rw, iocb, true, compat);
- break;
+ return aio_read(&req->rw, iocb, true, compat);
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->rw, iocb, true, compat);
- break;
+ return aio_write(&req->rw, iocb, true, compat);
case IOCB_CMD_FSYNC:
- ret = aio_fsync(&req->fsync, iocb, false);
- break;
+ return aio_fsync(&req->fsync, iocb, false);
case IOCB_CMD_FDSYNC:
- ret = aio_fsync(&req->fsync, iocb, true);
- break;
+ return aio_fsync(&req->fsync, iocb, true);
case IOCB_CMD_POLL:
- ret = aio_poll(req, iocb);
- break;
+ return aio_poll(req, iocb);
default:
pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
- ret = -EINVAL;
- break;
+ return -EINVAL;
}
-
- /*
- * If ret is 0, we'd either done aio_complete() ourselves or have
- * arranged for that to be done asynchronously. Anything non-zero
- * means that we need to destroy req ourselves.
- */
- if (ret)
- goto out_put_req;
- return 0;
-out_put_req:
- if (req->ki_eventfd)
- eventfd_ctx_put(req->ki_eventfd);
- iocb_put(req);
-out_put_reqs_available:
- put_reqs_available(ctx, 1);
- return ret;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
bool compat)
{
+ struct aio_kiocb *req;
struct iocb iocb;
+ int err;
if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
return -EFAULT;
- return __io_submit_one(ctx, &iocb, user_iocb, compat);
+ /* enforce forwards compatibility on users */
+ if (unlikely(iocb.aio_reserved2)) {
+ pr_debug("EINVAL: reserve field set\n");
+ return -EINVAL;
+ }
+
+ /* prevent overflows */
+ if (unlikely(
+ (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+ (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+ ((ssize_t)iocb.aio_nbytes < 0)
+ )) {
+ pr_debug("EINVAL: overflow check\n");
+ return -EINVAL;
+ }
+
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
+ return -EAGAIN;
+
+ err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
+
+ /* Done with the synchronous reference */
+ iocb_put(req);
+
+ /*
+ * If err is 0, we'd either done aio_complete() ourselves or have
+ * arranged for that to be done asynchronously. Anything non-zero
+ * means that we need to destroy req ourselves.
+ */
+ if (unlikely(err)) {
+ iocb_destroy(req);
+ put_reqs_available(ctx, 1);
+ }
+ return err;
}
/* sys_io_submit:
@@ -1997,24 +1973,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
}
#endif
-/* lookup_kiocb
- * Finds a given iocb for cancellation.
- */
-static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
-{
- struct aio_kiocb *kiocb;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- /* TODO: use a hash or array, this sucks. */
- list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
- if (kiocb->ki_user_iocb == iocb)
- return kiocb;
- }
- return NULL;
-}
-
/* sys_io_cancel:
* Attempts to cancel an iocb previously passed to io_submit. If
* the operation is successfully cancelled, the resulting event is
@@ -2032,6 +1990,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct aio_kiocb *kiocb;
int ret = -EINVAL;
u32 key;
+ u64 obj = (u64)(unsigned long)iocb;
if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
@@ -2043,10 +2002,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- kiocb = lookup_kiocb(ctx, iocb);
- if (kiocb) {
- ret = kiocb->ki_cancel(&kiocb->rw);
- list_del_init(&kiocb->ki_list);
+ /* TODO: use a hash or array, this sucks. */
+ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+ if (kiocb->ki_res.obj == obj) {
+ ret = kiocb->ki_cancel(&kiocb->rw);
+ list_del_init(&kiocb->ki_list);
+ break;
+ }
}
spin_unlock_irq(&ctx->ctx_lock);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..24615c76c1d0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -307,10 +307,10 @@ static void blkdev_bio_end_io(struct bio *bio)
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
- if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
- if (bio->bi_status && !dio->bio.bi_status)
- dio->bio.bi_status = bio->bi_status;
- } else {
+ if (bio->bi_status && !dio->bio.bi_status)
+ dio->bio.bi_status = bio->bi_status;
+
+ if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ int i;
- bio_for_each_segment_all(bvec, bio, i, iter_all)
- put_page(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
+ put_page(bvec->bv_page);
+ }
bio_put(bio);
}
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1d49694e6ae3..c5880329ae37 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6174,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
*
* This is overestimating in most cases.
*/
- qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
+ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ec2d8919e7fb..cd4e693406a0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -501,6 +501,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ /*
+ * If the fs is mounted with nologreplay, which requires it to be
+ * mounted in RO mode as well, we can not allow discard on free space
+ * inside block groups, because log trees refer to extents that are not
+ * pinned in a block group's free space cache (pinning the extents is
+ * precisely the first phase of replaying a log tree).
+ */
+ if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+ return -EROFS;
+
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dc6140013ae8..61d22a56c0ba 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
static int prop_compression_validate(const char *value, size_t len)
{
- if (!strncmp("lzo", value, len))
+ if (!strncmp("lzo", value, 3))
return 0;
- else if (!strncmp("zlib", value, len))
+ else if (!strncmp("zlib", value, 4))
return 0;
- else if (!strncmp("zstd", value, len))
+ else if (!strncmp("zstd", value, 4))
return 0;
return -EINVAL;
@@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode,
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
} else if (!strncmp("zlib", value, 4)) {
type = BTRFS_COMPRESS_ZLIB;
- } else if (!strncmp("zstd", value, len)) {
+ } else if (!strncmp("zstd", value, 4)) {
type = BTRFS_COMPRESS_ZSTD;
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
} else {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index eb680b715dd6..e659d9d61107 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1922,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
root_level < cur_level) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1869ba8e5981..67a6f7d47402 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2430,8 +2430,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ kunmap(p_page);
}
__free_page(p_page);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acdad6d658f5..e4e665f422fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
/*
* We use writeback_inodes_sb here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Flush dellaloc for any root that is going to be snapshotted.
+ * This is done to avoid a corrupted version of files, in the
+ * snapshots, that had both buffered and direct IO writes (even
+ * if they were done sequentially) due to an unordered update of
+ * the inode's size on disk.
+ */
+ list_for_each_entry(pending, head, list) {
+ int ret;
+
+ ret = btrfs_start_delalloc_snapshot(pending->root);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
{
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ } else {
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+
+ /*
+ * Wait for any dellaloc that we started previously for the roots
+ * that are going to be snapshotted. This is to avoid a corrupted
+ * version of files in the snapshots that had both buffered and
+ * direct IO writes (even if they were done sequentially).
+ */
+ list_for_each_entry(pending, head, list)
+ btrfs_wait_ordered_extents(pending->root,
+ U64_MAX, 0, U64_MAX);
+ }
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(fs_info);
+ ret = btrfs_start_delalloc_flush(trans);
if (ret)
goto cleanup_transaction;
@@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(fs_info);
+ btrfs_wait_delalloc_flush(trans);
btrfs_scrub_pause(fs_info);
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f06454a55e00..561884f60d35 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3578,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries, if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -4544,6 +4551,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller then the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when before a power failure we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
}
btrfs_release_path(path);
@@ -4705,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- ASSERT(len == i_size ||
- (len == fs_info->sectorsize &&
- btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE) ||
- (len < i_size && i_size < fs_info->sectorsize));
+ BTRFS_FILE_EXTENT_INLINE)
return 0;
- }
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9024eee889b9..db934ceae9c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6407,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio)
if (bio_op(bio) == REQ_OP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else
+ else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e3346628efe2..2d61ddda9bf5 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
+ kfree(ci->i_symlink);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -566,7 +567,6 @@ void ceph_destroy_inode(struct inode *inode)
}
}
- kfree(ci->i_symlink);
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
frag = rb_entry(n, struct ceph_inode_frag, node);
rb_erase(n, &ci->i_fragtree);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e92a2fee3c57..13c1288b04a7 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " type: CDROM ");
else
seq_printf(m, " type: %d ", dev_type);
- if (tcon->seal)
+
+ seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number);
+
+ if ((tcon->seal) ||
+ (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+ (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
seq_printf(m, " Encrypted");
if (tcon->nocase)
seq_printf(m, " nocase");
@@ -371,6 +376,10 @@ skip_rdma:
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
#endif
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ seq_puts(m, " encrypted");
+ if (ses->sign)
+ seq_puts(m, " signed");
seq_puts(m, "\n\tShares:");
j = 0;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d8bce2f862de..086ddc5108af 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,6 +43,9 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+/* query_info flags */
+#define PASSTHRU_QUERY_INFO 0x00000000
+#define PASSTHRU_FSCTL 0x00000001
struct smb_query_info {
__u32 info_type;
__u32 file_info_class;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 217276b8b942..a05bf1d6e1d0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -559,6 +559,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
tcon->ses->server->echo_interval / HZ);
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
+ if (tcon->handle_timeout)
+ seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
/* convert actimeo and display it in seconds */
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
@@ -1008,7 +1010,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
unsigned int xid;
int rc;
- if (remap_flags & ~REMAP_FILE_ADVISORY)
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 142164ef1f05..5c0298b9998f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.18"
+#define CIFS_VERSION "2.19"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f293e052e351..585ad3207cb1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -60,6 +60,12 @@
#define CIFS_MAX_ACTIMEO (1 << 30)
/*
+ * Max persistent and resilient handle timeout (milliseconds).
+ * Windows durable max was 960000 (16 minutes)
+ */
+#define SMB3_MAX_HANDLE_TIMEOUT 960000
+
+/*
* MAX_REQ is the maximum number of requests that WE will send
* on one socket concurrently.
*/
@@ -479,6 +485,14 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
__le16 *path, int is_dir,
unsigned long p);
+ /* make unix special files (block, char, fifo, socket) */
+ int (*make_node)(unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ char *full_path,
+ umode_t mode,
+ dev_t device_number);
};
struct smb_version_values {
@@ -578,6 +592,7 @@ struct smb_vol {
struct nls_table *local_nls;
unsigned int echo_interval; /* echo interval in secs */
__u64 snapshot_time; /* needed for timewarp tokens */
+ __u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
};
@@ -735,13 +750,13 @@ in_flight(struct TCP_Server_Info *server)
}
static inline bool
-has_credits(struct TCP_Server_Info *server, int *credits)
+has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
- return num > 0;
+ return num >= num_credits;
}
static inline void
@@ -962,11 +977,14 @@ cap_unix(struct cifs_ses *ses)
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
+ bool file_all_info_is_valid:1;
+
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
struct cifs_tcon *tcon;
struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
};
/*
@@ -1047,6 +1065,7 @@ struct cifs_tcon {
__u32 vol_serial_number;
__le64 vol_create_time;
__u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */
+ __u32 handle_timeout; /* persistent and durable handle timeout in ms */
__u32 ss_flags; /* sector size flags */
__u32 perf_sector_size; /* best sector size for perf */
__u32 max_chunks;
@@ -1314,6 +1333,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
}
struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_READ_FLG 1
@@ -1735,6 +1755,7 @@ require use of the stronger protocol */
* GlobalMid_Lock protects:
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
+ * list operations on global DnotifyReqList
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
@@ -1835,6 +1856,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock;
#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b95db2b593cb..4c0e44489f21 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -103,7 +103,7 @@ enum {
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
- Opt_echo_interval, Opt_max_credits,
+ Opt_echo_interval, Opt_max_credits, Opt_handletimeout,
Opt_snapshot,
/* Mount options which take string value */
@@ -208,6 +208,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
+ { Opt_handletimeout, "handletimeout=%s" },
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
{ Opt_snapshot, "snapshot=%s" },
@@ -1191,10 +1192,6 @@ next_pdu:
continue;
}
- if (server->large_buf)
- buf = server->bigbuf;
-
-
server->lstrp = jiffies;
for (i = 0; i < num_mids; i++) {
@@ -1623,6 +1620,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->actimeo = CIFS_DEF_ACTIMEO;
+ /* Most clients set timeout to 0, allows server to use its default */
+ vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
+
/* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */
vol->ops = &smb30_operations;
vol->vals = &smbdefault_values;
@@ -2021,6 +2021,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
break;
+ case Opt_handletimeout:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid handletimeout value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->handle_timeout = option;
+ if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) {
+ cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_echo_interval:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid echo interval value\n",
@@ -3187,6 +3199,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info)
return 0;
if (tcon->snapshot_time != volume_info->snapshot_time)
return 0;
+ if (tcon->handle_timeout != volume_info->handle_timeout)
+ return 0;
return 1;
}
@@ -3301,6 +3315,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
tcon->snapshot_time = volume_info->snapshot_time;
}
+ if (volume_info->handle_timeout) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "Use SMB2.1 or later for handle timeout option\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ } else
+ tcon->handle_timeout = volume_info->handle_timeout;
+ }
+
tcon->ses = ses;
if (volume_info->password) {
tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 907e85d65bb4..f26a48dd2e39 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
{
int rc = -EPERM;
unsigned int xid;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- struct cifs_io_parms io_parms;
char *full_path = NULL;
- struct inode *newinode = NULL;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- FILE_ALL_INFO *buf = NULL;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
- if (tcon->unix_ext) {
- struct cifs_unix_set_info_args args = {
- .mode = mode & ~current_umask(),
- .ctime = NO_CHANGE_64,
- .atime = NO_CHANGE_64,
- .mtime = NO_CHANGE_64,
- .device = device_number,
- };
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- args.uid = current_fsuid();
- args.gid = current_fsgid();
- } else {
- args.uid = INVALID_UID; /* no change */
- args.gid = INVALID_GID; /* no change */
- }
- rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc)
- goto mknod_out;
-
- rc = cifs_get_inode_info_unix(&newinode, full_path,
- inode->i_sb, xid);
-
- if (rc == 0)
- d_instantiate(direntry, newinode);
- goto mknod_out;
- }
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto mknod_out;
-
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto mknod_out;
-
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto mknod_out;
- }
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
- if (rc)
- goto mknod_out;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)buf;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(device_number));
- pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(direntry);
-
- /* FIXME: add code here to set EAs */
+ rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
+ full_path, mode,
+ device_number);
mknod_out:
kfree(full_path);
- kfree(buf);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4c144c1f50eb..9c0ccc06d172 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -360,13 +360,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
return cifs_file;
}
-/*
- * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server. Must be called without holding
- * tcon->open_file_lock and cifs_file->file_info_lock.
+/**
+ * cifsFileInfo_put - release a reference of file priv data
+ *
+ * Always potentially wait for oplock handler. See _cifsFileInfo_put().
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
+ _cifsFileInfo_put(cifs_file, true);
+}
+
+/**
+ * _cifsFileInfo_put - release a reference of file priv data
+ *
+ * This may involve closing the filehandle @cifs_file out on the
+ * server. Must be called without holding tcon->open_file_lock and
+ * cifs_file->file_info_lock.
+ *
+ * If @wait_for_oplock_handler is true and we are releasing the last
+ * reference, wait for any running oplock break handler of the file
+ * and cancel any pending one. If calling this function from the
+ * oplock break handler, you need to pass false.
+ *
+ */
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+{
struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -414,7 +432,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
spin_unlock(&tcon->open_file_lock);
- oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
+ oplock_break_cancelled = wait_oplock_handler ?
+ cancel_work_sync(&cifs_file->oplock_break) : false;
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
struct TCP_Server_Info *server = tcon->ses->server;
@@ -1645,8 +1664,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX && !rc)
+ if (flock->fl_flags & FL_POSIX) {
+ /*
+ * If this is a request to remove all locks because we
+ * are closing the file, it doesn't matter if the
+ * unlocking failed as both cifs.ko and the SMB server
+ * remove the lock on file close
+ */
+ if (rc) {
+ cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+ if (!(flock->fl_flags & FL_CLOSE))
+ return rc;
+ }
rc = locks_lock_file_wait(file, flock);
+ }
return rc;
}
@@ -2620,43 +2651,56 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this wdata.
- * Note: we are attempting to resend the whole wdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize,
- &credits);
+ if (wdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(wdata->cfile, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
- if (rc)
- goto out;
- if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (wsize < wdata->bytes);
+ /*
+ * Wait for credits to resend this wdata.
+ * Note: we are attempting to resend the whole wdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+ &wsize, &credits);
+ if (rc)
+ goto fail;
+
+ if (wsize < wdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (wsize < wdata->bytes);
+ wdata->credits = credits;
- wdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (wdata->cfile->invalidHandle)
- rc = cifs_reopen_file(wdata->cfile, false);
- if (!rc)
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
- }
+ }
- if (!rc) {
- list_add_tail(&wdata->list, wdata_list);
- return 0;
- }
+ /* If the write was successfully sent, we are done */
+ if (!rc) {
+ list_add_tail(&wdata->list, wdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, &wdata->credits, 0);
-out:
- kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &wdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
return rc;
}
@@ -2884,12 +2928,12 @@ restart_loop:
wdata->bytes, &tmp_from,
ctx->cfile, cifs_sb, &tmp_list,
ctx);
+
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
}
list_splice(&tmp_list, &ctx->list);
-
- kref_put(&wdata->refcount,
- cifs_uncached_writedata_release);
goto restart_loop;
}
}
@@ -3336,44 +3380,55 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
- /*
- * Wait for credits to resend this rdata.
- * Note: we are attempting to resend the whole rdata not in segments
- */
do {
- rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+ if (rdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
+ /*
+ * Wait for credits to resend this rdata.
+ * Note: we are attempting to resend the whole rdata not in
+ * segments
+ */
+ do {
+ rc = server->ops->wait_mtu_credits(server, rdata->bytes,
&rsize, &credits);
- if (rc)
- goto out;
+ if (rc)
+ goto fail;
- if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, &credits, 0);
- msleep(1000);
- }
- } while (rsize < rdata->bytes);
+ if (rsize < rdata->bytes) {
+ add_credits_and_wake_if(server, &credits, 0);
+ msleep(1000);
+ }
+ } while (rsize < rdata->bytes);
+ rdata->credits = credits;
- rdata->credits = credits;
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- rc = 0;
- if (rdata->cfile->invalidHandle)
- rc = cifs_reopen_file(rdata->cfile, true);
- if (!rc)
- rc = server->ops->async_readv(rdata);
- }
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
- if (!rc) {
- /* Add to aio pending list */
- list_add_tail(&rdata->list, rdata_list);
- return 0;
- }
+ /* If the read was successfully sent, we are done */
+ if (!rc) {
+ /* Add to aio pending list */
+ list_add_tail(&rdata->list, rdata_list);
+ return 0;
+ }
- add_credits_and_wake_if(server, &rdata->credits, 0);
-out:
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
+ /* Roll back credits and retry if needed */
+ add_credits_and_wake_if(server, &rdata->credits, 0);
+ } while (rc == -EAGAIN);
+fail:
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
return rc;
}
@@ -4567,6 +4622,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ _cifsFileInfo_put(cfile, false /* do not wait for ourself */);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bee203055b30..1e1626a2cfc3 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -501,8 +501,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&pCifsInode->flags);
- queue_work(cifsoplockd_wq,
- &netfile->oplock_break);
+ cifs_queue_oplock_break(netfile);
netfile->oplock_break_cancelled = false;
spin_unlock(&tcon->open_file_lock);
@@ -607,6 +606,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode)
spin_unlock(&cinode->writers_lock);
}
+/**
+ * cifs_queue_oplock_break - queue the oplock break handler for cfile
+ *
+ * This function is called from the demultiplex thread when it
+ * receives an oplock break for @cfile.
+ *
+ * Assumes the tcon->open_file_lock is held.
+ * Assumes cfile->file_info_lock is NOT held.
+ */
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
+{
+ /*
+ * Bump the handle refcount now while we hold the
+ * open_file_lock to enforce the validity of it for the oplock
+ * break handler. The matching put is done at the end of the
+ * handler.
+ */
+ cifsFileInfo_get(cfile);
+
+ queue_work(cifsoplockd_wq, &cfile->oplock_break);
+}
+
void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
{
clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index f0ce27c3c6e4..c711f1f39bf2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server)
return false;
}
+static int
+cifs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *newinode = NULL;
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ if (tcon->unix_ext) {
+ /*
+ * SMB1 Unix Extensions: requires server support but
+ * works with all special files
+ */
+ struct cifs_unix_set_info_args args = {
+ .mode = mode & ~current_umask(),
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = dev,
+ };
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ args.uid = current_fsuid();
+ args.gid = current_fsgid();
+ } else {
+ args.uid = INVALID_UID; /* no change */
+ args.gid = INVALID_GID; /* no change */
+ }
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ if (rc)
+ goto out;
+
+ rc = cifs_get_inode_info_unix(&newinode, full_path,
+ inode->i_sb, xid);
+
+ if (rc == 0)
+ d_instantiate(dentry, newinode);
+ goto out;
+ }
+
+ /*
+ * SMB1 SFU emulation: should work with all servers, but only
+ * support block and char device (no socket & fifo)
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
#endif /* CIFS_ACL */
+ .make_node = cifs_make_node,
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index b204e84b87fb..54bffb2a1786 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -68,13 +68,15 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (oparms->tcon->use_resilient) {
- nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ /* default timeout is 0, servers pick default (120 seconds) */
+ nr_ioctl_req.Timeout =
+ cpu_to_le32(oparms->tcon->handle_timeout);
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
- NULL, NULL /* no return info */);
+ CIFSMaxBufSize, NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
cifs_dbg(VFS,
"resiliency not supported by server, disabling\n");
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 01a76bccdb8d..278405d26c47 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -37,6 +37,16 @@
#include "smb2pdu.h"
#include "smb2proto.h"
+static void
+free_set_inf_compound(struct smb_rqst *rqst)
+{
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+}
+
+
static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
PATH_MAX * 2, 0, NULL);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
* SMB2_open() call.
*/
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_RMDIR:
memset(&si_iov, 0, sizeof(si_iov));
@@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_EOF:
memset(&si_iov, 0, sizeof(si_iov));
@@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_SET_INFO:
memset(&si_iov, 0, sizeof(si_iov));
@@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
+ full_path);
break;
case SMB2_OP_RENAME:
memset(&si_iov, 0, sizeof(si_iov));
@@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
break;
case SMB2_OP_HARDLINK:
memset(&si_iov, 0, sizeof(si_iov));
@@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0, data, size);
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
@@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
break;
case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
if (rqst[1].rq_iov)
SMB2_close_free(&rqst[1]);
break;
case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ free_set_inf_compound(rqst);
+ break;
case SMB2_OP_SET_INFO:
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ free_set_inf_compound(rqst);
break;
}
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
@@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_shroot(xid, tcon, &fid);
if (rc)
goto out;
- rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, smb2_data);
+
+ if (tcon->crfid.file_all_info_is_valid) {
+ move_smb2_info_to_cifs(data,
+ &tcon->crfid.file_all_info);
+ } else {
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ if (!rc)
+ move_smb2_info_to_cifs(data, smb2_data);
+ }
close_shroot(&tcon->crfid);
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
goto out;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 924269cec135..e32c264e3adb 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
"STATUS_UNFINISHED_CONTEXT_DELETED"},
{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
- {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+ /* Note that ENOATTTR and ENODATA are the same errno */
+ {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"},
{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
"STATUS_WRONG_CREDENTIAL_HANDLE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 0e3570e40ff8..e311f58dc1c8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -555,7 +555,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
- queue_work(cifsoplockd_wq, &cfile->oplock_break);
+ cifs_queue_oplock_break(cfile);
kfree(lw);
return true;
}
@@ -712,8 +712,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
- queue_work(cifsoplockd_wq,
- &cfile->oplock_break);
+
+ cifs_queue_oplock_break(cfile);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 085e91436da7..c36ff0d1fe2a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
rc = wait_event_killable(server->request_q,
- has_credits(server, &server->credits));
+ has_credits(server, &server->credits, 1));
cifs_num_waiters_dec(server);
if (rc)
return rc;
@@ -581,7 +581,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
NULL /* no data input */, 0 /* no data input */,
- (char **)&out_buf, &ret_data_len);
+ CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI,
"server does not support query network interfaces\n");
@@ -619,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref)
SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
cfid->fid->volatile_fid);
cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
}
}
@@ -643,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work)
*/
int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
{
- struct cifs_open_parms oparams;
- int rc;
- __le16 srch_path = 0; /* Null - since an open of top of share */
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
mutex_lock(&tcon->crfid.fid_mutex);
@@ -657,22 +667,85 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
return 0;
}
- oparams.tcon = tcon;
- oparams.create_options = 0;
- oparams.desired_access = FILE_READ_ATTRIBUTES;
- oparams.disposition = FILE_OPEN;
- oparams.fid = pfid;
- oparams.reconnect = false;
-
- rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL);
- if (rc == 0) {
- memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- kref_init(&tcon->crfid.refcount);
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = 0;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_exit;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto oshr_exit;
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
+ tcon->crfid.tcon = tcon;
+ tcon->crfid.is_valid = true;
+ kref_init(&tcon->crfid.refcount);
+
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
kref_get(&tcon->crfid.refcount);
- }
+ oplock = smb2_parse_lease_state(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key);
+ } else
+ goto oshr_exit;
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ if (!smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&tcon->crfid.file_all_info))
+ tcon->crfid.file_all_info_is_valid = 1;
+
+ oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -1222,7 +1295,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- NULL, 0 /* no input */,
+ NULL, 0 /* no input */, CIFSMaxBufSize,
(char **)&res_key, &ret_data_len);
if (rc) {
@@ -1253,7 +1326,8 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_query_info __user *pqi;
int rc = 0;
int flags = 0;
- struct smb2_query_info_rsp *rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct smb2_ioctl_rsp *io_rsp = NULL;
void *buffer = NULL;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -1263,6 +1337,7 @@ smb2_ioctl_query_info(const unsigned int xid,
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_fid fid;
struct kvec qi_iov[1];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec close_iov[1];
memset(rqst, 0, sizeof(rqst));
@@ -1313,15 +1388,35 @@ smb2_ioctl_query_info(const unsigned int xid,
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.file_info_class, qi.info_type,
- qi.additional_information,
+ if (qi.flags & PASSTHRU_FSCTL) {
+ /* Can eventually relax perm check since server enforces too */
+ if (!capable(CAP_SYS_ADMIN))
+ rc = -EPERM;
+ else {
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[1].rq_iov = io_iov;
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, NULL,
+ 0, CIFSMaxBufSize);
+ }
+ } else if (qi.flags == PASSTHRU_QUERY_INFO) {
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, qi.file_info_class,
+ qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ } else { /* unknown flags */
+ cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags);
+ rc = -EINVAL;
+ }
+
if (rc)
goto iqinf_exit;
smb2_set_next_command(tcon, &rqst[1]);
@@ -1341,24 +1436,44 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype, rsp_iov);
if (rc)
goto iqinf_exit;
- pqi = (struct smb_query_info __user *)arg;
- rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
- qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
+ if (qi.flags & PASSTHRU_FSCTL) {
+ pqi = (struct smb_query_info __user *)arg;
+ io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ } else {
+ pqi = (struct smb_query_info __user *)arg;
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
}
iqinf_exit:
kfree(buffer);
SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
+ if (qi.flags & PASSTHRU_FSCTL)
+ SMB2_ioctl_free(&rqst[1]);
+ else
+ SMB2_query_info_free(&rqst[1]);
+
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
@@ -1413,8 +1528,8 @@ smb2_copychunk_range(const unsigned int xid,
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
true /* is_fsctl */, (char *)pcchunk,
- sizeof(struct copychunk_ioctl), (char **)&retbuf,
- &ret_data_len);
+ sizeof(struct copychunk_ioctl), CIFSMaxBufSize,
+ (char **)&retbuf, &ret_data_len);
if (rc == 0) {
if (ret_data_len !=
sizeof(struct copychunk_ioctl_rsp)) {
@@ -1574,7 +1689,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
true /* is_fctl */,
- &setsparse, 1, NULL, NULL);
+ &setsparse, 1, CIFSMaxBufSize, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
cifs_dbg(FYI, "set sparse rc = %d\n", rc);
@@ -1647,7 +1762,7 @@ smb2_duplicate_extents(const unsigned int xid,
true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
- NULL,
+ CIFSMaxBufSize, NULL,
&ret_data_len);
if (ret_data_len > 0)
@@ -1682,7 +1797,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
- NULL,
+ CIFSMaxBufSize, NULL,
&ret_data_len);
}
@@ -1690,6 +1805,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
/* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */
#define GMT_TOKEN_SIZE 50
+#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */
+
/*
* Input buffer contains (empty) struct smb_snapshot array with size filled in
* For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2
@@ -1701,13 +1818,29 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
char *retbuf = NULL;
unsigned int ret_data_len = 0;
int rc;
+ u32 max_response_size;
struct smb_snapshot_array snapshot_in;
+ if (get_user(ret_data_len, (unsigned int __user *)ioc_buf))
+ return -EFAULT;
+
+ /*
+ * Note that for snapshot queries that servers like Azure expect that
+ * the first query be minimal size (and just used to get the number/size
+ * of previous versions) so response size must be specified as EXACTLY
+ * sizeof(struct snapshot_array) which is 16 when rounded up to multiple
+ * of eight bytes.
+ */
+ if (ret_data_len == 0)
+ max_response_size = MIN_SNAPSHOT_ARRAY_SIZE;
+ else
+ max_response_size = CIFSMaxBufSize;
+
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
true /* is_fsctl */,
- NULL, 0 /* no input data */,
+ NULL, 0 /* no input data */, max_response_size,
(char **)&retbuf,
&ret_data_len);
cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n",
@@ -2185,7 +2318,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
true /* is_fsctl */,
- (char *)dfs_req, dfs_req_size,
+ (char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
} while (rc == -EAGAIN);
@@ -2256,6 +2389,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov,
&resp_buftype);
+ if (!rc)
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (!rc || !err_iov.iov_base) {
rc = -ENOENT;
goto free_path;
@@ -2472,22 +2607,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
+ struct cifs_ses *ses = tcon->ses;
struct inode *inode;
struct cifsInodeInfo *cifsi;
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
+ struct smb_rqst rqst[2];
+ int resp_buftype[2];
+ struct kvec rsp_iov[2];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec si_iov[1];
+ unsigned int size[1];
+ void *data[1];
long rc;
unsigned int xid;
+ int num = 0, flags = 0;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
+
+
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
@@ -2498,33 +2649,74 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
rc = -EOPNOTSUPP;
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
free_xid(xid);
return rc;
}
- /*
- * need to make sure we are not asked to extend the file since the SMB3
- * fsctl does not change the file size. In the future we could change
- * this to zero the first part of the range then set the file size
- * which for a non sparse file would zero the newly extended range
- */
- if (keep_size == false)
- if (i_size_read(inode) < offset + len) {
- rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
- }
-
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[num].rq_iov = io_iov;
+ rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+ rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information),
+ CIFSMaxBufSize);
+ if (rc)
+ goto zero_range_exit;
+
+ /*
+ * do we also need to change the size of the file?
+ */
+ if (keep_size == false && i_size_read(inode) < offset + len) {
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num].rq_iov = si_iov;
+ rqst[num].rq_nvec = 1;
+
+ eof = cpu_to_le64(offset + len);
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = &eof;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num++],
+ cfile->fid.persistent_fid,
+ cfile->fid.persistent_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_related(&rqst[1]);
+ }
+
+ rc = compound_send_recv(xid, ses, flags, num, rqst,
+ resp_buftype, rsp_iov);
+
+ zero_range_exit:
+ SMB2_ioctl_free(&rqst[0]);
+ SMB2_set_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_xid(xid);
+ if (rc)
+ trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len, rc);
+ else
+ trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ ses->Suid, offset, len);
return rc;
}
@@ -2560,7 +2752,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
true /* is_fctl */, (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information), NULL, NULL);
+ sizeof(struct file_zero_data_information),
+ CIFSMaxBufSize, NULL, NULL);
free_xid(xid);
return rc;
}
@@ -2573,15 +2766,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
+ __le64 eof;
xid = get_xid();
inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
+ trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
if (keep_size == false) {
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
@@ -2601,6 +2799,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* BB: in future add else clause to extend file */
else
rc = -EOPNOTSUPP;
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
}
@@ -2616,14 +2820,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
rc = -EOPNOTSUPP;
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, off, len, rc);
free_xid(xid);
return rc;
}
- rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ } else {
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+ if (i_size_read(inode) < off + len) {
+ eof = cpu_to_le64(off + len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid,
+ &eof);
+ }
}
- /* BB: else ... in future add code to extend file and set sparse */
+ if (rc)
+ trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len, rc);
+ else
+ trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid,
+ tcon->ses->Suid, off, len);
free_xid(xid);
return rc;
@@ -3604,6 +3825,104 @@ smb2_next_header(char *buf)
return le32_to_cpu(hdr->NextCommand);
}
+static int
+smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc = -EPERM;
+ int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
+ FILE_ALL_INFO *buf = NULL;
+ struct cifs_io_parms io_parms;
+ __u32 oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
+ struct kvec iov[2];
+
+ /*
+ * Check if mounted with mount parm 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto out;
+
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto out;
+
+ cifs_dbg(FYI, "sfu compat create special file\n");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ if (rc)
+ goto out;
+
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
+
+ pdev = (struct win_dev *)buf;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major = cpu_to_le64(MAJOR(dev));
+ pdev->minor = cpu_to_le64(MINOR(dev));
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ }
+ tcon->ses->server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
+
+ /* FIXME: add code here to set EAs */
+out:
+ kfree(buf);
+ return rc;
+}
+
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -3698,6 +4017,7 @@ struct smb_version_operations smb20_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb21_operations = {
@@ -3796,6 +4116,7 @@ struct smb_version_operations smb21_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb30_operations = {
@@ -3903,6 +4224,7 @@ struct smb_version_operations smb30_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_operations smb311_operations = {
@@ -4011,6 +4333,7 @@ struct smb_version_operations smb311_operations = {
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
+ .make_node = smb2_make_node,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 60fbe306f604..b8f7262ac354 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -832,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
ses->server->ops = &smb21_operations;
- } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ ses->server->vals = &smb21_values;
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
ses->server->ops = &smb311_operations;
+ ses->server->vals = &smb311_values;
+ }
} else if (le16_to_cpu(rsp->DialectRevision) !=
ses->server->vals->protocol_id) {
/* if requested single dialect ensure returned dialect matched */
@@ -1002,7 +1005,8 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
- (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
+ (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
+ (char **)&pneg_rsp, &rsplen);
if (rc == -EOPNOTSUPP) {
/*
* Old Windows versions or Netapp SMB server can return
@@ -1628,9 +1632,16 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
- /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
+ /*
+ * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
+ * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+ * (Samba servers don't always set the flag so also check if null user)
+ */
if ((ses->server->dialect == SMB311_PROT_ID) &&
- !smb3_encryption_required(tcon))
+ !smb3_encryption_required(tcon) &&
+ !(ses->session_flags &
+ (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
+ ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
@@ -1797,9 +1808,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
return buf;
}
-static __u8
-parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key)
+__u8
+smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -1850,8 +1862,9 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
}
static struct create_durable_v2 *
-create_durable_v2_buf(struct cifs_fid *pfid)
+create_durable_v2_buf(struct cifs_open_parms *oparms)
{
+ struct cifs_fid *pfid = oparms->fid;
struct create_durable_v2 *buf;
buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
@@ -1865,7 +1878,14 @@ create_durable_v2_buf(struct cifs_fid *pfid)
(struct create_durable_v2, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
- buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+ /*
+ * NB: Handle timeout defaults to 0, which allows server to choose
+ * (most servers default to 120 seconds) and most clients default to 0.
+ * This can be overridden at mount ("handletimeout=") if the user wants
+ * a different persistent (or resilient) handle timeout for all opens
+ * opens on a particular SMB3 mount.
+ */
+ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
generate_random_uuid(buf->dcontext.CreateGuid);
memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
@@ -1918,7 +1938,7 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
- iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ iov[num].iov_base = create_durable_v2_buf(oparms);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_durable_v2);
@@ -2456,8 +2476,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key);
+ *oplock = smb2_parse_lease_state(server, rsp,
+ &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
@@ -2466,113 +2487,138 @@ creat_exit:
return rc;
}
-/*
- * SMB2 IOCTL is used for both IOCTLs and FSCTLs
- */
int
-SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen,
- char **out_data, u32 *plen /* returned data len */)
+SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ __u32 max_response_size)
{
- struct smb_rqst rqst;
struct smb2_ioctl_req *req;
- struct smb2_ioctl_rsp *rsp;
- struct cifs_ses *ses;
- struct kvec iov[2];
- struct kvec rsp_iov;
- int resp_buftype;
- int n_iov;
- int rc = 0;
- int flags = 0;
+ struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
-
- if (tcon)
- ses = tcon->ses;
- else
- return -EIO;
-
- if (!ses || !(ses->server))
- return -EIO;
+ int rc;
rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
if (rc)
return rc;
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
req->CtlCode = cpu_to_le32(opcode);
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
+ iov[0].iov_base = (char *)req;
+ /*
+ * If no input data, the size of ioctl struct in
+ * protocol spec still includes a 1 byte data buffer,
+ * but if input data passed to ioctl, we do not
+ * want to double count this, so we do not send
+ * the dummy one byte of data in iovec[0] if sending
+ * input data (in iovec[1]).
+ */
if (indatalen) {
req->InputCount = cpu_to_le32(indatalen);
/* do not set InputOffset if no input data */
req->InputOffset =
cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer));
+ rqst->rq_nvec = 2;
+ iov[0].iov_len = total_len - 1;
iov[1].iov_base = in_data;
iov[1].iov_len = indatalen;
- n_iov = 2;
- } else
- n_iov = 1;
+ } else {
+ rqst->rq_nvec = 1;
+ iov[0].iov_len = total_len;
+ }
req->OutputOffset = 0;
req->OutputCount = 0; /* MBZ */
/*
- * Could increase MaxOutputResponse, but that would require more
- * than one credit. Windows typically sets this smaller, but for some
+ * In most cases max_response_size is set to 16K (CIFSMaxBufSize)
+ * We Could increase default MaxOutputResponse, but that could require
+ * more credits. Windows typically sets this smaller, but for some
* ioctls it may be useful to allow server to send more. No point
* limiting what the server can send as long as fits in one credit
- * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE
- * (by default, note that it can be overridden to make max larger)
- * in responses (except for read responses which can be bigger.
- * We may want to bump this limit up
+ * We can not handle more than CIFS_MAX_BUF_SIZE yet but may want
+ * to increase this limit up in the future.
+ * Note that for snapshot queries that servers like Azure expect that
+ * the first query be minimal size (and just used to get the number/size
+ * of previous versions) so response size must be specified as EXACTLY
+ * sizeof(struct snapshot_array) which is 16 when rounded up to multiple
+ * of eight bytes. Currently that is the only case where we set max
+ * response size smaller.
*/
- req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
+ req->MaxOutputResponse = cpu_to_le32(max_response_size);
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
else
req->Flags = 0;
- iov[0].iov_base = (char *)req;
-
- /*
- * If no input data, the size of ioctl struct in
- * protocol spec still includes a 1 byte data buffer,
- * but if input data passed to ioctl, we do not
- * want to double count this, so we do not send
- * the dummy one byte of data in iovec[0] if sending
- * input data (in iovec[1]).
- */
-
- if (indatalen) {
- iov[0].iov_len = total_len - 1;
- } else
- iov[0].iov_len = total_len;
-
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ return 0;
+}
+
+void
+SMB2_ioctl_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+
+/*
+ * SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+ u64 volatile_fid, u32 opcode, bool is_fsctl,
+ char *in_data, u32 indatalen, u32 max_out_data_len,
+ char **out_data, u32 *plen /* returned data len */)
+{
+ struct smb_rqst rqst;
+ struct smb2_ioctl_rsp *rsp = NULL;
+ struct cifs_ses *ses;
+ struct kvec iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec rsp_iov = {NULL, 0};
+ int resp_buftype = CIFS_NO_BUFFER;
+ int rc = 0;
+ int flags = 0;
+
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
+ if (tcon)
+ ses = tcon->ses;
+ else
+ return -EIO;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
+ memset(&iov, 0, sizeof(iov));
rqst.rq_iov = iov;
- rqst.rq_nvec = n_iov;
+ rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode,
+ is_fsctl, in_data, indatalen, max_out_data_len);
+ if (rc)
+ goto ioctl_exit;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
if (rc != 0)
@@ -2622,6 +2668,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
}
ioctl_exit:
+ SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -2644,7 +2691,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
FSCTL_SET_COMPRESSION, true /* is_fsctl */,
(char *)&fsctl_input /* data input */,
- 2 /* in data len */, &ret_data /* out data */, NULL);
+ 2 /* in data len */, CIFSMaxBufSize /* max out data */,
+ &ret_data /* out data */, NULL);
cifs_dbg(FYI, "set compression rc %d\n", rc);
@@ -3403,8 +3451,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
rqst.rq_nvec = 1;
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(req);
-
rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
if (rc) {
@@ -3426,6 +3472,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
+ cifs_small_buf_release(req);
+
*nbytes = le32_to_cpu(rsp->DataLength);
if ((*nbytes > CIFS_MAX_MSGSIZE) ||
(*nbytes > io_parms->length)) {
@@ -3724,7 +3772,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst,
&resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(req);
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
@@ -3742,6 +3789,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->offset, *nbytes);
}
+ cifs_small_buf_release(req);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 0bd4d4802701..ee8977688e21 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -959,6 +959,13 @@ struct duplicate_extents_to_file {
__le64 ByteCount; /* Bytes to be copied */
} __packed;
+/*
+ * Maximum number of iovs we need for an ioctl request.
+ * [0] : struct smb2_ioctl_req
+ * [1] : in_data
+ */
+#define SMB2_IOCTL_IOV_SIZE 2
+
struct smb2_ioctl_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 87733b27a65f..52df125e9189 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -142,8 +142,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
extern void SMB2_open_free(struct smb_rqst *rqst);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
char **out_data, u32 *plen /* returned data len */);
+extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 opcode,
+ bool is_fsctl, char *in_data, u32 indatalen,
+ __u32 max_response_size);
+extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
@@ -223,6 +228,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server,
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 3d5f62150de4..447c0c6e4c64 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -30,9 +30,9 @@
*/
#define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
struct ntstatus {
/* Facility is the high 12 bits of the following field */
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d8b049afa606..99c4d799c24b 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -59,6 +59,8 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
+DEFINE_SMB3_RW_ERR_EVENT(zero_err);
+DEFINE_SMB3_RW_ERR_EVENT(falloc_err);
/* For logging successful read or write */
@@ -104,9 +106,13 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
DEFINE_SMB3_RW_DONE_EVENT(write_enter);
DEFINE_SMB3_RW_DONE_EVENT(read_enter);
DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
+DEFINE_SMB3_RW_DONE_EVENT(zero_enter);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
+DEFINE_SMB3_RW_DONE_EVENT(zero_done);
+DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -242,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ const char *full_path),
+ TP_ARGS(xid, tid, sesid, full_path),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __string(path, full_path)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __assign_str(path, full_path);
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __get_str(path))
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ const char *full_path), \
+ TP_ARGS(xid, tid, sesid, full_path))
+
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid),
+ TP_ARGS(xid, tid, sesid),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid), \
+ TP_ARGS(xid, tid, sesid))
+
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+
+
+DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, rc))
+
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+
/*
* For logging SMB3 Status code and Command for responses which return errors
*/
@@ -426,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
__field(unsigned int, xid)
__field(__u32, tid)
__field(__u64, sesid)
- __field(const char *, unc_name)
+ __string(name, unc_name)
__field(int, rc)
),
TP_fast_assign(
__entry->xid = xid;
__entry->tid = tid;
__entry->sesid = sesid;
- __entry->unc_name = unc_name;
+ __assign_str(name, unc_name);
__entry->rc = rc;
),
TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
__entry->xid, __entry->sesid, __entry->tid,
- __entry->unc_name, __entry->rc)
+ __get_str(name), __entry->rc)
)
#define DEFINE_SMB3_TCON_EVENT(name) \
@@ -713,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_ARGS(currmid, hostname, credits))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ce8a585abd6..1de8e996e566 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -486,15 +486,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
}
static int
-wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits, unsigned int *instance)
+wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
+ const int timeout, const int flags,
+ unsigned int *instance)
{
int rc;
+ int *credits;
+ int optype;
+ long int t;
+
+ if (timeout < 0)
+ t = MAX_JIFFY_OFFSET;
+ else
+ t = msecs_to_jiffies(timeout);
+
+ optype = flags & CIFS_OP_MASK;
*instance = 0;
+ credits = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*credits <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
+
spin_lock(&server->req_lock);
- if (timeout == CIFS_ASYNC_OP) {
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
@@ -504,14 +520,21 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
while (1) {
- if (*credits <= 0) {
+ if (*credits < num_credits) {
spin_unlock(&server->req_lock);
cifs_num_waiters_inc(server);
- rc = wait_event_killable(server->request_q,
- has_credits(server, credits));
+ rc = wait_event_killable_timeout(server->request_q,
+ has_credits(server, credits, num_credits), t);
cifs_num_waiters_dec(server);
- if (rc)
- return rc;
+ if (!rc) {
+ trace_smb3_credit_timeout(server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
spin_lock(&server->req_lock);
} else {
if (server->tcpStatus == CifsExiting) {
@@ -520,14 +543,52 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
/*
+ * For normal commands, reserve the last MAX_COMPOUND
+ * credits to compound requests.
+ * Otherwise these compounds could be permanently
+ * starved for credits by single-credit requests.
+ *
+ * To prevent spinning CPU, block this thread until
+ * there are >MAX_COMPOUND credits available.
+ * But only do this is we already have a lot of
+ * credits in flight to avoid triggering this check
+ * for servers that are slow to hand out credits on
+ * new sessions.
+ */
+ if (!optype && num_credits == 1 &&
+ server->in_flight > 2 * MAX_COMPOUND &&
+ *credits <= MAX_COMPOUND) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable_timeout(
+ server->request_q,
+ has_credits(server, credits,
+ MAX_COMPOUND + 1),
+ t);
+ cifs_num_waiters_dec(server);
+ if (!rc) {
+ trace_smb3_credit_timeout(
+ server->CurrentMid,
+ server->hostname, num_credits);
+ cifs_dbg(VFS, "wait timed out after %d ms\n",
+ timeout);
+ return -ENOTSUPP;
+ }
+ if (rc == -ERESTARTSYS)
+ return -ERESTARTSYS;
+ spin_lock(&server->req_lock);
+ continue;
+ }
+
+ /*
* Can not count locking commands against total
* as they are allowed to block on server.
*/
/* update # of requests on the wire to server */
- if (timeout != CIFS_BLOCKING_OP) {
- *credits -= 1;
- server->in_flight++;
+ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
+ *credits -= num_credits;
+ server->in_flight += num_credits;
*instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
@@ -538,16 +599,36 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
}
static int
-wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype, unsigned int *instance)
+wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance)
{
- int *val;
+ return wait_for_free_credits(server, 1, -1, flags,
+ instance);
+}
- val = server->ops->get_credits_field(server, optype);
- /* Since an echo is already inflight, no need to wait to send another */
- if (*val <= 0 && optype == CIFS_ECHO_OP)
- return -EAGAIN;
- return wait_for_free_credits(server, timeout, val, instance);
+static int
+wait_for_compound_request(struct TCP_Server_Info *server, int num,
+ const int flags, unsigned int *instance)
+{
+ int *credits;
+
+ credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
+
+ spin_lock(&server->req_lock);
+ if (*credits < num) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (server->in_flight < num - *credits) {
+ spin_unlock(&server->req_lock);
+ return -ENOTSUPP;
+ }
+ }
+ spin_unlock(&server->req_lock);
+
+ return wait_for_free_credits(server, num, 60000, flags,
+ instance);
}
int
@@ -646,16 +727,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_handle_t *handle, void *cbdata, const int flags,
const struct cifs_credits *exist_credits)
{
- int rc, timeout, optype;
+ int rc;
struct mid_q_entry *mid;
struct cifs_credits credits = { .value = 0, .instance = 0 };
unsigned int instance;
+ int optype;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype, &instance);
+ rc = wait_for_free_request(server, flags, &instance);
if (rc)
return rc;
credits.value = 1;
@@ -871,18 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
const int flags, const int num_rqst, struct smb_rqst *rqst,
int *resp_buf_type, struct kvec *resp_iov)
{
- int i, j, rc = 0;
- int timeout, optype;
+ int i, j, optype, rc = 0;
struct mid_q_entry *midQ[MAX_COMPOUND];
bool cancelled_mid[MAX_COMPOUND] = {false};
struct cifs_credits credits[MAX_COMPOUND] = {
{ .value = 0, .instance = 0 }
};
unsigned int instance;
- unsigned int first_instance = 0;
char *buf;
- timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
for (i = 0; i < num_rqst; i++)
@@ -896,81 +974,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->tcpStatus == CifsExiting)
return -ENOENT;
- spin_lock(&ses->server->req_lock);
- if (ses->server->credits < num_rqst) {
- /*
- * Return immediately if not too many requests in flight since
- * we will likely be stuck on waiting for credits.
- */
- if (ses->server->in_flight < num_rqst - ses->server->credits) {
- spin_unlock(&ses->server->req_lock);
- return -ENOTSUPP;
- }
- } else {
- /* enough credits to send the whole compounded request */
- ses->server->credits -= num_rqst;
- ses->server->in_flight += num_rqst;
- first_instance = ses->server->reconnect_instance;
- }
- spin_unlock(&ses->server->req_lock);
-
- if (first_instance) {
- cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst);
- for (i = 0; i < num_rqst; i++) {
- credits[i].value = 1;
- credits[i].instance = first_instance;
- }
- goto setup_rqsts;
- }
-
/*
- * There are not enough credits to send the whole compound request but
- * there are requests in flight that may bring credits from the server.
+ * Wait for all the requests to become available.
* This approach still leaves the possibility to be stuck waiting for
* credits if the server doesn't grant credits to the outstanding
- * requests. This should be fixed by returning immediately and letting
- * a caller fallback to sequential commands instead of compounding.
- * Ensure we obtain 1 credit per request in the compound chain.
+ * requests and if the client is completely idle, not generating any
+ * other requests.
+ * This can be handled by the eventual session reconnect.
*/
- for (i = 0; i < num_rqst; i++) {
- rc = wait_for_free_request(ses->server, timeout, optype,
- &instance);
-
- if (rc == 0) {
- credits[i].value = 1;
- credits[i].instance = instance;
- /*
- * All parts of the compound chain must get credits from
- * the same session, otherwise we may end up using more
- * credits than the server granted. If there were
- * reconnects in between, return -EAGAIN and let callers
- * handle it.
- */
- if (i == 0)
- first_instance = instance;
- else if (first_instance != instance) {
- i++;
- rc = -EAGAIN;
- }
- }
+ rc = wait_for_compound_request(ses->server, num_rqst, flags,
+ &instance);
+ if (rc)
+ return rc;
- if (rc) {
- /*
- * We haven't sent an SMB packet to the server yet but
- * we already obtained credits for i requests in the
- * compound chain - need to return those credits back
- * for future use. Note that we need to call add_credits
- * multiple times to match the way we obtained credits
- * in the first place and to account for in flight
- * requests correctly.
- */
- for (j = 0; j < i; j++)
- add_credits(ses->server, &credits[j], optype);
- return rc;
- }
+ for (i = 0; i < num_rqst; i++) {
+ credits[i].value = 1;
+ credits[i].instance = instance;
}
-setup_rqsts:
/*
* Make sure that we sign in the same order that we send on this socket
* and avoid races inside tcp sendmsg code that could cause corruption
@@ -981,14 +1002,12 @@ setup_rqsts:
/*
* All the parts of the compound chain belong obtained credits from the
- * same session (see the appropriate checks above). In the same time
- * there might be reconnects after those checks but before we acquired
- * the srv_mutex. We can not use credits obtained from the previous
+ * same session. We can not use credits obtained from the previous
* session to send this request. Check if there were reconnects after
* we obtained credits and return -EAGAIN in such cases to let callers
* handle it.
*/
- if (first_instance != ses->server->reconnect_instance) {
+ if (instance != ses->server->reconnect_instance) {
mutex_unlock(&ses->server->srv_mutex);
for (j = 0; j < num_rqst; j++)
add_credits(ses->server, &credits[j], optype);
@@ -1057,7 +1076,7 @@ setup_rqsts:
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
for (i = 0; i < num_rqst; i++) {
@@ -1194,7 +1213,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
int
SendReceive(const unsigned int xid, struct cifs_ses *ses,
struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int timeout)
+ int *pbytes_returned, const int flags)
{
int rc = 0;
struct mid_q_entry *midQ;
@@ -1225,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
+ rc = wait_for_free_request(ses->server, flags, &credits.instance);
if (rc)
return rc;
@@ -1264,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
- if (timeout == CIFS_ASYNC_OP)
+ if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP)
goto out;
rc = wait_for_response(ses->server, midQ);
@@ -1367,8 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
- &instance);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance);
if (rc)
return rc;
diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..e5e54da1715f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -33,6 +33,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <asm/pgalloc.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -1407,7 +1408,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
+ pgtable_t pgtable = NULL;
struct page *zero_page;
spinlock_t *ptl;
pmd_t pmd_entry;
@@ -1422,12 +1425,22 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE, false);
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
spin_unlock(ptl);
goto fallback;
}
+ if (pgtable) {
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
@@ -1436,6 +1449,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
return VM_FAULT_NOPAGE;
fallback:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 95b5e78c22b1..f25daa207421 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static void debugfs_evict_inode(struct inode *inode)
+static void debugfs_i_callback(struct rcu_head *head)
{
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
+ struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
+ free_inode_nonrcu(inode);
+}
+
+static void debugfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, debugfs_i_callback);
}
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
- .evict_inode = debugfs_evict_inode,
+ .destroy_inode = debugfs_destroy_inode,
};
static void debugfs_release_dentry(struct dentry *dentry)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a1ac7e9245ec..75a5309f2231 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (ext4_handle_valid(handle)) {
+ if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d49837b..98ec11f69cd4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- if (pos >= i_size_read(inode))
+ if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
return 0;
if ((pos | iov_iter_alignment(from)) & blockmask)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c2225f0d31b5..2024d3fa5504 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1222,6 +1222,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
ext4_lblk_t offsets[4], offsets2[4];
Indirect chain[4], chain2[4];
Indirect *partial, *partial2;
+ Indirect *p = NULL, *p2 = NULL;
ext4_lblk_t max_block;
__le32 nr = 0, nr2 = 0;
int n = 0, n2 = 0;
@@ -1263,7 +1264,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
}
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
if (nr) {
if (partial == chain) {
/* Shared branch grows from the inode */
@@ -1288,13 +1289,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
end_range:
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
if (nr2) {
if (partial2 == chain2) {
/*
@@ -1324,16 +1323,14 @@ end_range:
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
goto do_indirects;
}
/* Punch happened within the same level (n == n2) */
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
/* Free top, but only if partial2 isn't its subtree. */
if (nr) {
@@ -1390,11 +1387,7 @@ end_range:
partial->p + 1,
partial2->p,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
- return 0;
+ goto cleanup;
}
/*
@@ -1409,8 +1402,6 @@ end_range:
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
if (partial2 > chain2 && depth2 <= depth) {
@@ -1418,11 +1409,21 @@ end_range:
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
}
+
+cleanup:
+ while (p && p > chain) {
+ BUFFER_TRACE(p->bh, "call brelse");
+ brelse(p->bh);
+ p--;
+ }
+ while (p2 && p2 > chain2) {
+ BUFFER_TRACE(p2->bh, "call brelse");
+ brelse(p2->bh);
+ p2--;
+ }
return 0;
do_indirects:
@@ -1430,7 +1431,7 @@ do_indirects:
switch (offsets[0]) {
default:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_IND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
@@ -1439,7 +1440,7 @@ do_indirects:
/* fall through */
case EXT4_IND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
@@ -1448,7 +1449,7 @@ do_indirects:
/* fall through */
case EXT4_DIND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
@@ -1458,5 +1459,5 @@ do_indirects:
case EXT4_TIND_BLOCK:
;
}
- return 0;
+ goto cleanup;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b54b261ded36..b32a57bc5d5d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6080,36 +6080,6 @@ out:
return;
}
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
- struct ext4_iloc iloc;
-
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, &iloc);
- if (!err) {
- BUFFER_TRACE(iloc.bh, "get_write_access");
- err = jbd2_journal_get_write_access(handle, iloc.bh);
- if (!err)
- err = ext4_handle_dirty_metadata(handle,
- NULL,
- iloc.bh);
- brelse(iloc.bh);
- }
- }
- ext4_std_error(inode->i_sb, err);
- return err;
-}
-#endif
-
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
journal_t *journal;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3c4f8bb59f8a..bab3da4f1e0d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1000,6 +1000,13 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ /*
+ * We haven't replayed the journal, so we cannot use our
+ * block-bitmap-guided storage zapping commands.
+ */
+ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+ return -EROFS;
+
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3d9b18505c0c..e7ae26e36c9c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
n_group_desc[gdb_num] = gdb_bh;
+
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (err) {
+ kvfree(n_group_desc);
+ brelse(gdb_bh);
+ return err;
+ }
+
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
kvfree(o_group_desc);
- BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
return err;
}
@@ -2073,6 +2080,10 @@ out:
free_flex_gd(flex_gd);
if (resize_inode != NULL)
iput(resize_inode);
- ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+ if (err)
+ ext4_warning(sb, "error (%d) occurred during "
+ "file system resize", err);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu",
+ ext4_blocks_count(es));
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f5b828bf1299..6ed4eb81e674 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
spin_unlock(&sbi->s_md_lock);
}
+static bool system_going_down(void)
+{
+ return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+ || system_state == SYSTEM_RESTART;
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb)
if (journal)
jbd2_journal_abort(journal, -EIO);
}
- if (test_opt(sb, ERRORS_RO)) {
+ /*
+ * We force ERRORS_RO behavior when system is rebooting. Otherwise we
+ * could panic during 'reboot -f' as the underlying device got already
+ * disabled.
+ */
+ if (test_opt(sb, ERRORS_RO) || system_going_down()) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
@@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb)
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- }
- if (test_opt(sb, ERRORS_PANIC)) {
+ } else if (test_opt(sb, ERRORS_PANIC)) {
if (EXT4_SB(sb)->s_journal &&
!(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
return;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f955cd3e0677..a98e1b02279e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* collect a number of dirty meta pages and write together */
- if (wbc->for_kupdate ||
- get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_META) <
+ nr_pages_to_skip(sbi, META))
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
@@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
@@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
}
@@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
+ __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
- else
- __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+ /*
+ * TODO: we count on fsck.f2fs to clear this flag until we figure out
+ * missing cases which clear it incorrectly.
+ */
if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 568e1d09eb48..9727944139f2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -301,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
for (; start < F2FS_IO_SIZE(sbi); start++) {
struct page *page =
mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+ GFP_NOIO | __GFP_NOFAIL);
f2fs_bug_on(sbi, !page);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPagePrivate(page);
set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
lock_page(page);
@@ -1553,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
if (last_block > last_block_in_file)
last_block = last_block_in_file;
+ /* just zeroing out page which is beyond EOF */
+ if (block_in_file >= last_block)
+ goto zero_out;
/*
* Map blocks using the previous result first.
*/
@@ -1565,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
* Then do more f2fs_map_blocks() calls until we are
* done with this page.
*/
- map.m_flags = 0;
-
- if (block_in_file < last_block) {
- map.m_lblk = block_in_file;
- map.m_len = last_block - block_in_file;
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_DEFAULT))
- goto set_error_page;
- }
+ if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT))
+ goto set_error_page;
got_it:
if ((map.m_flags & F2FS_MAP_MAPPED)) {
block_nr = map.m_pblk + block_in_file - map.m_lblk;
@@ -1589,6 +1588,7 @@ got_it:
DATA_GENERIC))
goto set_error_page;
} else {
+zero_out:
zero_user_segment(page, 0, PAGE_SIZE);
if (!PageUptodate(page))
SetPageUptodate(page);
@@ -1863,8 +1863,13 @@ got_it:
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio);
- if (err && PageWriteback(page))
- end_page_writeback(page);
+ if (err) {
+ if (f2fs_encrypted_file(inode))
+ fscrypt_pullback_bio_page(&fio->encrypted_page,
+ true);
+ if (PageWriteback(page))
+ end_page_writeback(page);
+ }
trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
return err;
@@ -2315,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- f2fs_truncate_blocks(inode, i_size, true, true);
+ if (!IS_NOQUOTA(inode))
+ f2fs_truncate_blocks(inode, i_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2585,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
{
struct f2fs_private_dio *dio;
bool write = (bio_op(bio) == REQ_OP_WRITE);
- int err;
dio = f2fs_kzalloc(F2FS_I_SB(inode),
sizeof(struct f2fs_private_dio), GFP_NOFS);
- if (!dio) {
- err = -ENOMEM;
+ if (!dio)
goto out;
- }
dio->inode = inode;
dio->orig_end_io = bio->bi_end_io;
@@ -2710,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
clear_cold_data(page);
- /* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return f2fs_drop_inmem_page(inode, page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -2729,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 0;
clear_cold_data(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
return 1;
}
@@ -2798,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping,
return -EAGAIN;
}
- /*
- * A reference is expected if PagePrivate set when move mapping,
- * however F2FS breaks this for maintaining dirty page counts when
- * truncating pages. So here adjusting the 'extra_count' make it work.
- */
- extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+ /* one extra reference was held for atomic_write page */
+ extra_count = atomic_written ? 1 : 0;
rc = migrate_page_move_mapping(mapping, newpage,
page, mode, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) {
@@ -2824,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping,
get_page(newpage);
}
- if (PagePrivate(page))
- SetPagePrivate(newpage);
- set_page_private(newpage, page_private(page));
+ if (PagePrivate(page)) {
+ f2fs_set_page_private(newpage, page_private(page));
+ f2fs_clear_page_private(page);
+ }
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fd7f170e2f2d..99e9a5c37b71 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->free_secs = free_sections(sbi);
si->prefree_count = prefree_segments(sbi);
si->dirty_count = dirty_segments(sbi);
- si->node_pages = NODE_MAPPING(sbi)->nrpages;
- si->meta_pages = META_MAPPING(sbi)->nrpages;
+ if (sbi->node_inode)
+ si->node_pages = NODE_MAPPING(sbi)->nrpages;
+ if (sbi->meta_inode)
+ si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
@@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned npages;
int i;
if (si->base_mem)
@@ -258,10 +259,14 @@ get_cache:
sizeof(struct extent_node);
si->page_mem = 0;
- npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
- npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ if (sbi->node_inode) {
+ unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
+ if (sbi->meta_inode) {
+ unsigned npages = META_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
}
static int stat_show(struct seq_file *s, void *v)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 713b36a10a79..59bc46017855 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
ClearPageUptodate(page);
clear_cold_data(page);
inode_dec_dirty_pages(dir);
@@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (de->name_len == 0) {
bit_pos++;
ctx->pos = start_pos + bit_pos;
+ printk_ratelimited(
+ "%s, invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, le32_to_cpu(de->ino));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
/* check memory boundary before moving forward */
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- if (unlikely(bit_pos > d->max)) {
+ if (unlikely(bit_pos > d->max ||
+ le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"%s: corrupted namelen=%d, run fsck to fix.",
__func__, le16_to_cpu(de->name_len));
@@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_get_lock_data_page(inode, n, false);
+ dentry_page = f2fs_find_data_page(inode, n);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
@@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
if (err) {
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
break;
}
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
}
out_free:
fscrypt_fname_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 1cb0fcc67d2d..caf77fe8ac07 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
unsigned int end = fofs + len;
unsigned int pos = (unsigned int)fofs;
bool updated = false;
- bool leftmost;
+ bool leftmost = false;
if (!et)
return;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7ea5c9cede37..87f75ebd2fd6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -190,6 +190,8 @@ enum {
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */
+#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */
struct cp_control {
int reason;
@@ -253,7 +255,7 @@ struct discard_entry {
/* max discard pend list number */
#define MAX_PLIST_NUM 512
#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
- (MAX_PLIST_NUM - 1) : (blk_num - 1))
+ (MAX_PLIST_NUM - 1) : ((blk_num) - 1))
enum {
D_PREP, /* initial */
@@ -309,6 +311,7 @@ struct discard_policy {
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
unsigned int granularity; /* discard granularity */
+ int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -455,7 +458,6 @@ struct f2fs_flush_device {
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
-#define DEF_MIN_INLINE_SIZE 1
static inline int get_extra_isize(struct inode *inode);
static inline int get_inline_xattr_addrs(struct inode *inode);
#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
@@ -1098,6 +1100,7 @@ enum {
SBI_IS_SHUTDOWN, /* shutdown by ioctl */
SBI_IS_RECOVERED, /* recovered orphan/data */
SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */
SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
@@ -1109,6 +1112,7 @@ enum {
DISCARD_TIME,
GC_TIME,
DISABLE_TIME,
+ UMOUNT_DISCARD_TIMEOUT,
MAX_TIME,
};
@@ -1237,8 +1241,6 @@ struct f2fs_sb_info {
unsigned int nquota_files; /* # of quota sysfile */
- u32 s_next_generation; /* for NFS support */
-
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
@@ -1798,13 +1800,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
- count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
- count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
- count_type == F2FS_RD_META)
- return;
-
- set_sbi_flag(sbi, SBI_IS_DIRTY);
+ if (count_type == F2FS_DIRTY_DENTS ||
+ count_type == F2FS_DIRTY_NODES ||
+ count_type == F2FS_DIRTY_META ||
+ count_type == F2FS_DIRTY_QDATA ||
+ count_type == F2FS_DIRTY_IMETA)
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -2156,10 +2157,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
get_pages(sbi, F2FS_WB_CP_DATA) ||
get_pages(sbi, F2FS_DIO_READ) ||
- get_pages(sbi, F2FS_DIO_WRITE) ||
- atomic_read(&SM_I(sbi)->dcc_info->queued_discard) ||
- atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ get_pages(sbi, F2FS_DIO_WRITE))
return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info &&
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
+ return false;
+
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ return false;
+
return f2fs_time_over(sbi, type);
}
@@ -2300,11 +2308,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */
#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */
#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
@@ -2761,9 +2770,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr))
#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \
- ((offsetof(typeof(*f2fs_inode), field) + \
+ ((offsetof(typeof(*(f2fs_inode)), field) + \
sizeof((f2fs_inode)->field)) \
- <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \
+ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
@@ -2792,8 +2801,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
-#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \
- (!is_read_io(fio->op) || fio->is_meta))
+#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \
+ (!is_read_io((fio)->op) || (fio)->is_meta))
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
@@ -2825,13 +2834,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
return true;
}
+static inline void f2fs_set_page_private(struct page *page,
+ unsigned long data)
+{
+ if (PagePrivate(page))
+ return;
+
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, data);
+}
+
+static inline void f2fs_clear_page_private(struct page *page)
+{
+ if (!PagePrivate(page))
+ return;
+
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
@@ -3005,7 +3034,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
@@ -3610,8 +3639,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
#endif
-#endif
-
static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
{
#ifdef CONFIG_QUOTA
@@ -3624,3 +3651,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
#endif
return false;
}
+
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ba5954f41e14..5742ab8b57dc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -589,8 +589,7 @@ truncate_out:
return 0;
}
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
- bool buf_write)
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
int count = 0, err = 0;
struct page *ipage;
bool truncate_page = false;
- int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO;
trace_f2fs_truncate_blocks_enter(inode, from);
@@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
goto free_partial;
if (lock)
- __do_map_lock(sbi, flag, true);
+ f2fs_lock_op(sbi);
ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
@@ -646,7 +644,7 @@ free_next:
err = f2fs_truncate_inode_blocks(inode, free_from);
out:
if (lock)
- __do_map_lock(sbi, flag, false);
+ f2fs_unlock_op(sbi);
free_partial:
/* lastly zero out the first data page */
if (!err)
@@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode)
return err;
}
- err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ err = f2fs_truncate_blocks(inode, i_size_read(inode), true);
if (err)
return err;
@@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
- bool size_changed = false;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
@@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
down_write(&F2FS_I(inode)->i_sem);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
up_write(&F2FS_I(inode)->i_sem);
-
- size_changed = true;
}
__setattr_copy(inode, attr);
@@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
/* file size may changed here */
- f2fs_mark_inode_dirty_sync(inode, size_changed);
+ f2fs_mark_inode_dirty_sync(inode, true);
/* inode change will produce dirty node pages flushed by checkpoint */
f2fs_balance_fs(F2FS_I_SB(inode), true);
@@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
new_size = i_size_read(inode) - len;
truncate_pagecache(inode, new_size);
- ret = f2fs_truncate_blocks(inode, new_size, true, false);
+ ret = f2fs_truncate_blocks(inode, new_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
down_write(&F2FS_I(inode)->i_mmap_sem);
- ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret)
return ret;
@@ -1651,6 +1646,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
flags |= F2FS_ENCRYPT_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
flags |= F2FS_INLINE_DATA_FL;
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ flags |= F2FS_NOCOW_FL;
flags &= F2FS_FL_USER_VISIBLE;
@@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (!get_dirty_pages(inode))
- goto skip_flush;
-
- f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ /*
+ * Should wait end_io to count F2FS_WB_CP_DATA correctly by
+ * f2fs_is_atomic_file.
+ */
+ if (get_dirty_pages(inode))
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
@@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
-skip_flush:
+
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
break;
case F2FS_GOING_DOWN_NEED_FSCK:
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
/* do checkpoint only */
ret = f2fs_sync_fs(sb, 1);
- if (ret)
- goto out;
- break;
+ goto out;
default:
ret = -EINVAL;
goto out;
@@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
out:
if (in != F2FS_GOING_DOWN_FULLSYNC)
mnt_drop_write_file(filp);
+
+ trace_f2fs_shutdown(sbi, in, ret);
+
return ret;
}
@@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
__u32 pin;
int ret = 0;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
if (get_user(pin, (__u32 __user *)arg))
return -EFAULT;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index d636cbcf68f2..bb6a152310ef 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -298,7 +298,7 @@ process_inline:
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (f2fs_truncate_blocks(inode, 0, false, false))
+ if (f2fs_truncate_blocks(inode, 0, false))
return false;
goto process_inline;
}
@@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
return 0;
punch_dentry_pages:
truncate_inode_pages(&dir->i_data, 0);
- f2fs_truncate_blocks(dir, 0, false, false);
+ f2fs_truncate_blocks(dir, 0, false);
f2fs_remove_dirty_inode(dir);
return err;
}
@@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (IS_ERR(ipage))
return PTR_ERR(ipage);
+ /*
+ * f2fs_readdir was protected by inode.i_rwsem, it is safe to access
+ * ipage without page's lock held.
+ */
+ unlock_page(ipage);
+
inline_dentry = inline_data_addr(inode, ipage);
make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (!err)
ctx->pos = d.max;
- f2fs_put_page(ipage, 1);
+ f2fs_put_page(ipage, 0);
return err < 0 ? err : 0;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d910a820ae67..e7f2e8759315 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -14,6 +14,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "xattr.h"
#include <trace/events/f2fs.h>
@@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (f2fs_has_extra_attr(inode) &&
+ f2fs_sb_has_flexible_inline_xattr(sbi) &&
+ f2fs_has_inline_xattr(inode) &&
+ (!fi->i_inline_xattr_size ||
+ fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx) has corrupted "
+ "i_inline_xattr_size: %d, max: %zu",
+ __func__, inode->i_ino, fi->i_inline_xattr_size,
+ MAX_INLINE_XATTR_SIZE);
+ return false;
+ }
+
if (F2FS_I(inode)->extent_tree) {
struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e967d27c1a89..f5e34e467003 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/ctype.h>
+#include <linux/random.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
@@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = sbi->s_next_generation++;
+ inode->i_generation = prandom_u32();
if (S_ISDIR(inode->i_mode))
F2FS_I(inode)->i_current_depth = 1;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4f450e573312..3f99ab288695 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
f2fs_balance_fs_bg(sbi);
/* collect a number of dirty node pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+ if (wbc->sync_mode != WB_SYNC_ALL &&
+ get_pages(sbi, F2FS_DIRTY_NODES) <
+ nr_pages_to_skip(sbi, NODE))
goto skip_write;
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, 0);
f2fs_trace_pid(page);
return 1;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 9b79056d705d..aa7fe79b62b2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
- set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
- SetPagePrivate(page);
+ f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
}
static int __revoke_inmem_pages(struct inode *inode,
- struct list_head *head, bool drop, bool recover)
+ struct list_head *head, bool drop, bool recover,
+ bool trylock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inmem_pages *cur, *tmp;
@@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode,
if (drop)
trace_f2fs_commit_inmem_page(page, INMEM_DROP);
- lock_page(page);
+ if (trylock) {
+ /*
+ * to avoid deadlock in between page lock and
+ * inmem_lock.
+ */
+ if (!trylock_page(page))
+ continue;
+ } else {
+ lock_page(page);
+ }
f2fs_wait_on_page_writeback(page, DATA, true, true);
@@ -270,8 +279,7 @@ next:
ClearPageUptodate(page);
clear_cold_data(page);
}
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- mutex_lock(&fi->inmem_lock);
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- mutex_unlock(&fi->inmem_lock);
+ while (!list_empty(&fi->inmem_pages)) {
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, true);
+
+ if (list_empty(&fi->inmem_pages)) {
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ }
+ mutex_unlock(&fi->inmem_lock);
+ }
clear_inode_flag(inode, FI_ATOMIC_FILE);
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
@@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
kmem_cache_free(inmem_entry_slab, cur);
ClearPageUptodate(page);
- set_page_private(page, 0);
- ClearPagePrivate(page);
+ f2fs_clear_page_private(page);
f2fs_put_page(page, 0);
trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
@@ -429,12 +442,15 @@ retry:
* recovery or rewrite & commit last transaction. For other
* error number, revoking was done by filesystem itself.
*/
- err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ err = __revoke_inmem_pages(inode, &revoke_list,
+ false, true, false);
/* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ __revoke_inmem_pages(inode, &fi->inmem_pages,
+ true, false, false);
} else {
- __revoke_inmem_pages(inode, &revoke_list, false, false);
+ __revoke_inmem_pages(inode, &revoke_list,
+ false, false, false);
}
return err;
@@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
struct block_device *bdev)
{
- struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
+ struct bio *bio;
int ret;
+ bio = f2fs_bio_alloc(sbi, 0, false);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
bio_set_dev(bio, bdev);
ret = submit_bio_wait(bio);
@@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi)
if (holes[DATA] > ovp || holes[NODE] > ovp)
return -EAGAIN;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
+ dirty_segments(sbi) > overprovision_segments(sbi))
+ return -EAGAIN;
return 0;
}
@@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
+ dpolicy->timeout = 0;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->max_requests = UINT_MAX;
dpolicy->io_aware = false;
+ /* we need to issue all to keep CP_TRIMMED_FLAG */
+ dpolicy->granularity = 1;
}
}
@@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
+ if (dpolicy->timeout != 0)
+ f2fs_update_time(sbi, dpolicy->timeout);
+
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ if (dpolicy->timeout != 0 &&
+ f2fs_time_over(sbi, dpolicy->timeout))
+ break;
+
if (i + 1 < dpolicy->granularity)
break;
@@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
}
/* This comes from f2fs_put_super */
-bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct discard_policy dpolicy;
@@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
+ dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
stat_inc_inplace_blocks(fio->sbi);
err = f2fs_submit_page_bio(fio);
- if (!err)
+ if (!err) {
update_device_state(fio);
-
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ }
return err;
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a77f76f528b6..5c7ed0442d6e 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
}
}
mutex_unlock(&dcc->cmd_lock);
- if (!wakeup)
+ if (!wakeup || !is_idle(sbi, DISCARD_TIME))
return;
wake_up:
dcc->discard_wake = 1;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d1ccc52afc93..f2aaa2cc6b3e 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
if (!qname) {
f2fs_msg(sb, KERN_ERR,
"Not enough memory for storing quotafile name");
- return -EINVAL;
+ return -ENOMEM;
}
if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
@@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
return -EINVAL;
- if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
f2fs_msg(sb, KERN_WARNING,
"Not support %d, larger than %d",
1 << arg, BIO_MAX_PAGES);
@@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options)
}
if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+ int min_size, max_size;
+
if (!f2fs_sb_has_extra_attr(sbi) ||
!f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_msg(sb, KERN_ERR,
@@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options)
"set with inline_xattr option");
return -EINVAL;
}
- if (!F2FS_OPTION(sbi).inline_xattr_size ||
- F2FS_OPTION(sbi).inline_xattr_size >=
- DEF_ADDRS_PER_INODE -
- F2FS_TOTAL_EXTRA_ATTR_SIZE -
- DEF_INLINE_RESERVED_SIZE -
- DEF_MIN_INLINE_SIZE) {
+
+ min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32);
+ max_size = MAX_INLINE_XATTR_SIZE;
+
+ if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
+ F2FS_OPTION(sbi).inline_xattr_size > max_size) {
f2fs_msg(sb, KERN_ERR,
- "inline xattr size is out of range");
+ "inline xattr size is out of range: %d ~ %d",
+ min_size, max_size);
return -EINVAL;
}
}
@@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode)
sb_start_intwrite(inode->i_sb);
f2fs_i_size_write(inode, 0);
+ f2fs_submit_merged_write_cond(F2FS_I_SB(inode),
+ inode, NULL, 0, DATA);
+ truncate_inode_pages_final(inode->i_mapping);
+
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
@@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- dropped = f2fs_wait_discard_bios(sbi);
+ dropped = f2fs_issue_discard_timeout(sbi);
if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
!sbi->discard_blks && !dropped) {
@@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_bug_on(sbi, sbi->fsync_node_num);
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
+
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
/*
* iput() can update stat information, if f2fs_write_checkpoint()
@@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb);
static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
+ unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
- int err;
+ int err = 0;
+ int ret;
+ if (s_flags & SB_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "checkpoint=disable on readonly fs");
+ return -EINVAL;
+ }
sbi->sb->s_flags |= SB_ACTIVE;
f2fs_update_time(sbi, DISABLE_TIME);
@@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
mutex_lock(&sbi->gc_mutex);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
- if (err == -ENODATA)
+ if (err == -ENODATA) {
+ err = 0;
break;
+ }
if (err && err != -EAGAIN)
- return err;
+ break;
}
- err = sync_filesystem(sbi->sb);
- if (err)
- return err;
+ ret = sync_filesystem(sbi->sb);
+ if (ret || err) {
+ err = ret ? ret: err;
+ goto restore_flag;
+ }
- if (f2fs_disable_cp_again(sbi))
- return -EAGAIN;
+ if (f2fs_disable_cp_again(sbi)) {
+ err = -EAGAIN;
+ goto restore_flag;
+ }
mutex_lock(&sbi->gc_mutex);
cpc.reason = CP_PAUSE;
@@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
sbi->unusable_block_count = 0;
mutex_unlock(&sbi->gc_mutex);
- return 0;
+restore_flag:
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ return err;
}
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
@@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb)
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
}
}
+ /*
+ * In case of checkpoint=disable, we must flush quota blocks.
+ * This can cause NULL exception for node_inode in end_io, since
+ * put_super already dropped it.
+ */
+ sync_filesystem(sb);
}
static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
@@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
+ sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] =
+ DEF_UMOUNT_DISCARD_TIMEOUT;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
for (i = 0; i < NR_COUNT_TYPE; i++)
@@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct f2fs_super_block *raw_super;
struct inode *root;
int err;
- bool retry = true, need_fsck = false;
+ bool skip_recovery = false, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
struct curseg_info *seg_i;
+ int retry_cnt = 1;
try_onemore:
err = -EINVAL;
@@ -3097,7 +3131,6 @@ try_onemore:
sb->s_maxbytes = sbi->max_file_blocks <<
le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
#ifdef CONFIG_QUOTA
sb->dq_op = &f2fs_quota_operations;
@@ -3200,6 +3233,10 @@ try_onemore:
if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) {
+ set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+ sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL;
+ }
/* Initialize device list */
err = f2fs_scan_devices(sbi);
@@ -3288,7 +3325,7 @@ try_onemore:
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
- goto free_root_inode;
+ goto free_node_inode;
}
err = f2fs_register_sysfs(sbi);
@@ -3310,7 +3347,7 @@ try_onemore:
goto free_meta;
if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
- goto skip_recovery;
+ goto reset_checkpoint;
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -3327,11 +3364,13 @@ try_onemore:
if (need_fsck)
set_sbi_flag(sbi, SBI_NEED_FSCK);
- if (!retry)
- goto skip_recovery;
+ if (skip_recovery)
+ goto reset_checkpoint;
err = f2fs_recover_fsync_data(sbi, false);
if (err < 0) {
+ if (err != -ENOMEM)
+ skip_recovery = true;
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%d", err);
@@ -3347,14 +3386,14 @@ try_onemore:
goto free_meta;
}
}
-skip_recovery:
+reset_checkpoint:
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
} else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
f2fs_enable_checkpoint(sbi);
}
@@ -3367,7 +3406,7 @@ skip_recovery:
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
- goto free_meta;
+ goto sync_free_meta;
}
kvfree(options);
@@ -3387,8 +3426,14 @@ skip_recovery:
cur_cp_version(F2FS_CKPT(sbi)));
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
+ clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
return 0;
+sync_free_meta:
+ /* safe to flush all the data */
+ sync_filesystem(sbi->sb);
+ retry_cnt = 0;
+
free_meta:
#ifdef CONFIG_QUOTA
f2fs_truncate_quota_inode_pages(sb);
@@ -3402,6 +3447,8 @@ free_meta:
* falls into an infinite loop in f2fs_sync_meta_pages().
*/
truncate_inode_pages_final(META_MAPPING(sbi));
+ /* evict some inodes being cached by GC */
+ evict_inodes(sb);
f2fs_unregister_sysfs(sbi);
free_root_inode:
dput(sb->s_root);
@@ -3410,6 +3457,7 @@ free_node_inode:
f2fs_release_ino_entry(sbi, true);
truncate_inode_pages_final(NODE_MAPPING(sbi));
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
free_stats:
f2fs_destroy_stats(sbi);
free_nm:
@@ -3422,6 +3470,7 @@ free_devices:
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
@@ -3443,8 +3492,8 @@ free_sbi:
kvfree(sbi);
/* give only one another chance */
- if (retry) {
- retry = false;
+ if (retry_cnt > 0 && skip_recovery) {
+ retry_cnt--;
shrink_dcache_sb(sb);
goto try_onemore;
}
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 70da6801c86f..729f46a3c9ee 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -222,6 +222,8 @@ out:
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
return -EINVAL;
+ if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
+ return -EINVAL;
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
@@ -278,10 +280,16 @@ out:
return count;
}
- *ui = t;
- if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
- f2fs_reset_iostat(sbi);
+ if (!strcmp(a->attr.name, "iostat_enable")) {
+ sbi->iostat_enable = !!t;
+ if (!sbi->iostat_enable)
+ f2fs_reset_iostat(sbi);
+ return count;
+ }
+
+ *ui = (unsigned int)t;
+
return count;
}
@@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
interval_time[DISCARD_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
+ umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
+ ATTR_LIST(umount_discard_timeout),
ATTR_LIST(iostat_enable),
ATTR_LIST(readdir_ra),
ATTR_LIST(gc_pin_file_thresh),
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index ce2a5eb210b6..d0ab533a9ce8 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -14,7 +14,7 @@
#include "trace.h"
static RADIX_TREE(pids, GFP_ATOMIC);
-static struct mutex pids_lock;
+static spinlock_t pids_lock;
static struct last_io_info last_io;
static inline void __print_last_io(void)
@@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page)
set_page_private(page, (unsigned long)pid);
+retry:
if (radix_tree_preload(GFP_NOFS))
return;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
p = radix_tree_lookup(&pids, pid);
if (p == current)
goto out;
if (p)
radix_tree_delete(&pids, pid);
- f2fs_radix_tree_insert(&pids, pid, current);
+ if (radix_tree_insert(&pids, pid, current)) {
+ spin_unlock(&pids_lock);
+ radix_tree_preload_end();
+ cond_resched();
+ goto retry;
+ }
trace_printk("%3x:%3x %4x %-16s\n",
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
pid, current->comm);
out:
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
radix_tree_preload_end();
}
@@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
void f2fs_build_trace_ios(void)
{
- mutex_init(&pids_lock);
+ spin_lock_init(&pids_lock);
}
#define PIDVEC_SIZE 128
@@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void)
pid_t next_pid = 0;
unsigned int found;
- mutex_lock(&pids_lock);
+ spin_lock(&pids_lock);
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
unsigned idx;
@@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void)
for (idx = 0; idx < found; idx++)
radix_tree_delete(&pids, pid[idx]);
}
- mutex_unlock(&pids_lock);
+ spin_unlock(&pids_lock);
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 18d5ffbc5e8c..848a785abe25 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
{
struct f2fs_xattr_entry *entry;
unsigned int inline_size = inline_xattr_size(inode);
+ void *max_addr = base_addr + inline_size;
list_for_each_xattr(entry, base_addr) {
- if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
- (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
- base_addr + inline_size) {
+ if ((void *)entry + sizeof(__u32) > max_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
*last_addr = entry;
return NULL;
}
@@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
if (!memcmp(entry->e_name, name, len))
break;
}
+
+ /* inline xattr header or entry across max inline xattr size */
+ if (IS_XATTR_LAST_ENTRY(entry) &&
+ (void *)entry + sizeof(__u32) > max_addr) {
+ *last_addr = entry;
+ return NULL;
+ }
return entry;
}
@@ -340,7 +347,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
*base_addr = txattr_addr;
return 0;
fail:
- kzfree(txattr_addr);
+ kvfree(txattr_addr);
return err;
}
@@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!handler || (handler->list && !handler->list(dentry)))
continue;
- prefix = handler->prefix ?: handler->name;
+ prefix = xattr_prefix(handler);
prefix_len = strlen(prefix);
size = prefix_len + entry->e_name_len + 1;
if (buffer) {
@@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
@@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
- kzfree(base_addr);
+ kvfree(base_addr);
return error;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 67db134da0f5..9172ee082ca8 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -78,6 +78,12 @@ struct f2fs_xattr_entry {
sizeof(struct f2fs_xattr_header) - \
sizeof(struct f2fs_xattr_entry))
+#define MAX_INLINE_XATTR_SIZE \
+ (DEF_ADDRS_PER_INODE - \
+ F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \
+ DEF_INLINE_RESERVED_SIZE - \
+ MIN_INLINE_DENTRY_SIZE / sizeof(__le32))
+
/*
* On-disk structure of f2fs_xattr
* We use inline xattrs space + 1 block for xattr.
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 842e8f749db6..570d71043acf 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -410,7 +410,7 @@ bool fs_validate_description(const struct fs_parameter_description *desc)
for (param = desc->specs; param->name; param++) {
if (param->opt == e->opt &&
param->type != fs_param_is_enum) {
- pr_err("VALIDATE %s: e[%lu] enum val for %s\n",
+ pr_err("VALIDATE %s: e[%tu] enum val for %s\n",
name, e - desc->enums, param->name);
good = false;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8a63e52785e9..9971a35cf1ef 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
ret = -EINVAL;
- if (rem < len) {
- pipe_unlock(pipe);
- goto out;
- }
+ if (rem < len)
+ goto out_free;
rem = len;
while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
} else {
- pipe_buf_get(pipe, ibuf);
+ if (!pipe_buf_get(pipe, ibuf))
+ goto out_free;
+
*obuf = *ibuf;
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
pipe_lock(pipe);
+out_free:
for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]);
pipe_unlock(pipe);
-out:
kvfree(bufs);
return ret;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ec32fece5e1e..9285dd4f4b1c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -755,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
umode_t mode, dev_t dev)
{
struct inode *inode;
- struct resv_map *resv_map;
+ struct resv_map *resv_map = NULL;
- resv_map = resv_map_alloc();
- if (!resv_map)
- return NULL;
+ /*
+ * Reserve maps are only needed for inodes that can have associated
+ * page allocations.
+ */
+ if (S_ISREG(mode) || S_ISLNK(mode)) {
+ resv_map = resv_map_alloc();
+ if (!resv_map)
+ return NULL;
+ }
inode = new_inode(sb);
if (inode) {
@@ -794,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
- } else
- kref_put(&resv_map->refs, resv_map_release);
+ } else {
+ if (resv_map)
+ kref_put(&resv_map->refs, resv_map_release);
+ }
return inode;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c88088d92613..f65f85d89217 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,17 +189,28 @@ struct sqe_submit {
bool needs_fixed_file;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct io_poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool woken;
+ bool done;
bool canceled;
struct wait_queue_entry wait;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct io_kiocb {
union {
+ struct file *file;
struct kiocb rw;
struct io_poll_iocb poll;
};
@@ -214,6 +225,7 @@ struct io_kiocb {
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_PREPPED 16 /* prep already done */
u64 user_data;
u64 error;
@@ -326,7 +338,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
tail = ctx->cached_cq_tail;
/* See comment at the top of the file */
smp_rmb();
- if (tail + 1 == READ_ONCE(ring->r.head))
+ if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
return NULL;
ctx->cached_cq_tail++;
@@ -355,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
}
}
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
long res, unsigned ev_flags)
{
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+ io_cqring_fill_event(ctx, user_data, res, ev_flags);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
+ io_cqring_ev_posted(ctx);
}
static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
if (!percpu_ref_tryget(&ctx->refs))
return NULL;
if (!state) {
- req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+ req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
goto out;
} else if (!state->free_reqs) {
@@ -396,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
int ret;
sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
- ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
- state->reqs);
- if (unlikely(ret <= 0))
- goto out;
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ if (unlikely(ret <= 0)) {
+ state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!state->reqs[0])
+ goto out;
+ ret = 1;
+ }
state->free_reqs = ret - 1;
state->cur_req = 1;
req = state->reqs[0];
@@ -411,7 +437,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->ctx = ctx;
req->flags = 0;
- refcount_set(&req->refs, 0);
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
return req;
out:
io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +456,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
static void io_free_req(struct io_kiocb *req)
{
- if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
- io_ring_drop_ctx_refs(req->ctx, 1);
- kmem_cache_free(req_cachep, req);
- }
+ if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+ fput(req->file);
+ io_ring_drop_ctx_refs(req->ctx, 1);
+ kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+ if (refcount_dec_and_test(&req->refs))
+ io_free_req(req);
}
/*
@@ -442,44 +475,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct list_head *done)
{
void *reqs[IO_IOPOLL_BATCH];
- int file_count, to_free;
- struct file *file = NULL;
struct io_kiocb *req;
+ int to_free;
- file_count = to_free = 0;
+ to_free = 0;
while (!list_empty(done)) {
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
- reqs[to_free++] = req;
(*nr_events)++;
- /*
- * Batched puts of the same file, to avoid dirtying the
- * file usage count multiple times, if avoidable.
- */
- if (!(req->flags & REQ_F_FIXED_FILE)) {
- if (!file) {
- file = req->rw.ki_filp;
- file_count = 1;
- } else if (file == req->rw.ki_filp) {
- file_count++;
+ if (refcount_dec_and_test(&req->refs)) {
+ /* If we're not using fixed files, we have to pair the
+ * completion part with the file put. Use regular
+ * completions for those, only batch free for fixed
+ * file.
+ */
+ if (req->flags & REQ_F_FIXED_FILE) {
+ reqs[to_free++] = req;
+ if (to_free == ARRAY_SIZE(reqs))
+ io_free_req_many(ctx, reqs, &to_free);
} else {
- fput_many(file, file_count);
- file = req->rw.ki_filp;
- file_count = 1;
+ io_free_req(req);
}
}
-
- if (to_free == ARRAY_SIZE(reqs))
- io_free_req_many(ctx, reqs, &to_free);
}
- io_commit_cqring(ctx);
- if (file)
- fput_many(file, file_count);
+ io_commit_cqring(ctx);
io_free_req_many(ctx, reqs, &to_free);
}
@@ -602,21 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
}
}
-static void io_fput(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_FIXED_FILE))
- fput(req->rw.ki_filp);
-}
-
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
kiocb_end_write(kiocb);
- io_fput(req);
io_cqring_add_event(req->ctx, req->user_data, res, 0);
- io_free_req(req);
+ io_put_req(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -666,11 +682,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add_tail(&req->list, &ctx->poll_list);
}
-static void io_file_put(struct io_submit_state *state, struct file *file)
+static void io_file_put(struct io_submit_state *state)
{
- if (!state) {
- fput(file);
- } else if (state->file) {
+ if (state->file) {
int diff = state->has_refs - state->used_refs;
if (diff)
@@ -695,7 +709,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd)
state->ios_left--;
return state->file;
}
- io_file_put(state, NULL);
+ io_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
if (!state->file)
@@ -731,31 +745,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
const struct io_uring_sqe *sqe = s->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
- unsigned ioprio, flags;
- int fd, ret;
+ unsigned ioprio;
+ int ret;
+ if (!req->file)
+ return -EBADF;
/* For -EAGAIN retry, everything is already prepped */
- if (kiocb->ki_filp)
+ if (req->flags & REQ_F_PREPPED)
return 0;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
+ if (force_nonblock && !io_file_supports_async(req->file))
+ force_nonblock = false;
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files ||
- (unsigned) fd >= ctx->nr_user_files))
- return -EBADF;
- kiocb->ki_filp = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- if (s->needs_fixed_file)
- return -EBADF;
- kiocb->ki_filp = io_file_get(state, fd);
- if (unlikely(!kiocb->ki_filp))
- return -EBADF;
- if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
- force_nonblock = false;
- }
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +765,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
if (ioprio) {
ret = ioprio_check_cap(ioprio);
if (ret)
- goto out_fput;
+ return ret;
kiocb->ki_ioprio = ioprio;
} else
@@ -772,38 +773,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
- goto out_fput;
+ return ret;
if (force_nonblock) {
kiocb->ki_flags |= IOCB_NOWAIT;
req->flags |= REQ_F_FORCE_NONBLOCK;
}
if (ctx->flags & IORING_SETUP_IOPOLL) {
- ret = -EOPNOTSUPP;
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
- goto out_fput;
+ return -EOPNOTSUPP;
req->error = 0;
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
} else {
- if (kiocb->ki_flags & IOCB_HIPRI) {
- ret = -EINVAL;
- goto out_fput;
- }
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
kiocb->ki_complete = io_complete_rw;
}
+ req->flags |= REQ_F_PREPPED;
return 0;
-out_fput:
- if (!(flags & IOSQE_FIXED_FILE)) {
- /*
- * in case of error, we didn't use this file reference. drop it.
- */
- if (state)
- state->used_refs--;
- io_file_put(state, kiocb->ki_filp);
- }
- return ret;
}
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +853,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
if (offset)
iov_iter_advance(iter, offset);
+
+ /* don't drop a reference to these pages */
+ iter->type |= ITER_BVEC_FLAG_NO_REF;
return 0;
}
@@ -887,7 +879,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
opcode = READ_ONCE(sqe->opcode);
if (opcode == IORING_OP_READ_FIXED ||
opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ int ret = io_import_fixed(ctx, rw, sqe, iter);
*iovec = NULL;
return ret;
}
@@ -945,31 +937,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
async_list->io_end = io_end;
}
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t ret;
+ int ret;
ret = io_prep_rw(req, s, force_nonblock, state);
if (ret)
return ret;
file = kiocb->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
if (ret)
- goto out_fput;
+ return ret;
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +981,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
}
}
kfree(iovec);
-out_fput:
- /* Hold on to the file for -EAGAIN */
- if (unlikely(ret && ret != -EAGAIN))
- io_fput(req);
return ret;
}
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t ret;
+ int ret;
ret = io_prep_rw(req, s, force_nonblock, state);
if (ret)
return ret;
- ret = -EBADF;
file = kiocb->ki_filp;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
if (ret)
- goto out_fput;
+ return ret;
iov_count = iov_iter_count(&iter);
@@ -1036,6 +1020,8 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
+ ssize_t ret2;
+
/*
* Open-code file_start_write here to grab freeze protection,
* which will be released by another thread in
@@ -1050,14 +1036,22 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
SB_FREEZE_WRITE);
}
kiocb->ki_flags |= IOCB_WRITE;
- io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+
+ ret2 = call_write_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ ret = -EAGAIN;
+ }
}
out_free:
kfree(iovec);
-out_fput:
- /* Hold on to the file for -EAGAIN */
- if (unlikely(ret && ret != -EAGAIN))
- io_fput(req);
return ret;
}
@@ -1072,29 +1066,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- /*
- * Twilight zone - it's possible that someone issued an opcode that
- * has a file attached, then got -EAGAIN on submission, and changed
- * the sqe before we retried it from async context. Avoid dropping
- * a file reference for this malicious case, and flag the error.
- */
- if (req->rw.ki_filp) {
- err = -EBADF;
- io_fput(req);
- }
io_cqring_add_event(ctx, user_data, err, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags;
- int fd;
- /* Prep already done */
- if (req->rw.ki_filp)
+ if (!req->file)
+ return -EBADF;
+ /* Prep already done (EAGAIN retry) */
+ if (req->flags & REQ_F_PREPPED)
return 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1086,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
- fd = READ_ONCE(sqe->fd);
- flags = READ_ONCE(sqe->flags);
-
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
- return -EBADF;
- req->rw.ki_filp = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- req->rw.ki_filp = fget(fd);
- if (unlikely(!req->rw.ki_filp))
- return -EBADF;
- }
-
+ req->flags |= REQ_F_PREPPED;
return 0;
}
@@ -1144,9 +1115,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
end > 0 ? end : LLONG_MAX,
fsync_flags & IORING_FSYNC_DATASYNC);
- io_fput(req);
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
@@ -1204,15 +1174,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_unlock_irq(&ctx->completion_lock);
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ __poll_t mask)
{
- io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
- io_fput(req);
- io_free_req(req);
+ req->poll.done = true;
+ io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+ io_commit_cqring(ctx);
}
static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1211,11 @@ static void io_poll_complete_work(struct work_struct *work)
return;
}
list_del_init(&req->list);
+ io_poll_complete(ctx, req, mask);
spin_unlock_irq(&ctx->completion_lock);
- io_poll_complete(req, mask);
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1226,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
-
- poll->woken = true;
+ unsigned long flags;
/* for instances that support it check for an event match first: */
- if (mask) {
- unsigned long flags;
+ if (mask && !(mask & poll->events))
+ return 0;
- if (!(mask & poll->events))
- return 0;
+ list_del_init(&poll->wait.entry);
- /* try to complete the iocb inline if we can: */
- if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- list_del(&req->list);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ list_del(&req->list);
+ io_poll_complete(ctx, req, mask);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
- list_del_init(&poll->wait.entry);
- io_poll_complete(req, mask);
- return 1;
- }
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
+ } else {
+ queue_work(ctx->sqo_wq, &req->work);
}
- list_del_init(&poll->wait.entry);
- queue_work(ctx->sqo_wq, &req->work);
return 1;
}
@@ -1305,36 +1274,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- unsigned flags;
+ bool cancel = false;
__poll_t mask;
u16 events;
- int fd;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
return -EINVAL;
+ if (!poll->file)
+ return -EBADF;
INIT_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
- if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
- return -EBADF;
- poll->file = ctx->user_files[fd];
- req->flags |= REQ_F_FIXED_FILE;
- } else {
- poll->file = fget(fd);
- }
- if (unlikely(!poll->file))
- return -EBADF;
-
poll->head = NULL;
- poll->woken = false;
+ poll->done = false;
poll->canceled = false;
ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1302,44 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
- /* one for removal from waitqueue, one for this function */
- refcount_set(&req->refs, 2);
-
mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
- if (unlikely(!poll->head)) {
- /* we did not manage to set up a waitqueue, done */
- goto out;
- }
spin_lock_irq(&ctx->completion_lock);
- spin_lock(&poll->head->lock);
- if (poll->woken) {
- /* wake_up context handles the rest */
- mask = 0;
+ if (likely(poll->head)) {
+ spin_lock(&poll->head->lock);
+ if (unlikely(list_empty(&poll->wait.entry))) {
+ if (ipt.error)
+ cancel = true;
+ ipt.error = 0;
+ mask = 0;
+ }
+ if (mask || ipt.error)
+ list_del_init(&poll->wait.entry);
+ else if (cancel)
+ WRITE_ONCE(poll->canceled, true);
+ else if (!poll->done) /* actually waiting for an event */
+ list_add_tail(&req->list, &ctx->cancel_list);
+ spin_unlock(&poll->head->lock);
+ }
+ if (mask) { /* no async, we'd stolen it */
+ req->error = mangle_poll(mask);
ipt.error = 0;
- } else if (mask || ipt.error) {
- /* if we get an error or a mask we are done */
- WARN_ON_ONCE(list_empty(&poll->wait.entry));
- list_del_init(&poll->wait.entry);
- } else {
- /* actually waiting for an event */
- list_add_tail(&req->list, &ctx->cancel_list);
+ io_poll_complete(ctx, req, mask);
}
- spin_unlock(&poll->head->lock);
spin_unlock_irq(&ctx->completion_lock);
-out:
- if (unlikely(ipt.error)) {
- if (!(flags & IOSQE_FIXED_FILE))
- fput(poll->file);
- /*
- * Drop one of our refs to this req, __io_submit_sqe() will
- * drop the other one since we're returning an error.
- */
- io_free_req(req);
- return ipt.error;
+ if (mask) {
+ io_cqring_ev_posted(ctx);
+ io_put_req(req);
}
-
- if (mask)
- io_poll_complete(req, mask);
- io_free_req(req);
- return 0;
+ return ipt.error;
}
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct sqe_submit *s, bool force_nonblock,
struct io_submit_state *state)
{
- ssize_t ret;
- int opcode;
+ int ret, opcode;
if (unlikely(s->index >= ctx->sq_entries))
return -EINVAL;
@@ -1524,10 +1468,13 @@ restart:
break;
cond_resched();
} while (1);
+
+ /* drop submission reference */
+ io_put_req(req);
}
if (ret) {
io_cqring_add_event(ctx, sqe->user_data, ret, 0);
- io_free_req(req);
+ io_put_req(req);
}
/* async context always use a copy of the sqe */
@@ -1614,11 +1561,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
return ret;
}
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+ int op = READ_ONCE(sqe->opcode);
+
+ switch (op) {
+ case IORING_OP_NOP:
+ case IORING_OP_POLL_REMOVE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+ struct io_submit_state *state, struct io_kiocb *req)
+{
+ unsigned flags;
+ int fd;
+
+ flags = READ_ONCE(s->sqe->flags);
+ fd = READ_ONCE(s->sqe->fd);
+
+ if (!io_op_needs_file(s->sqe)) {
+ req->file = NULL;
+ return 0;
+ }
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files ||
+ (unsigned) fd >= ctx->nr_user_files))
+ return -EBADF;
+ req->file = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ if (s->needs_fixed_file)
+ return -EBADF;
+ req->file = io_file_get(state, fd);
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
+
+ return 0;
+}
+
static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
struct io_submit_state *state)
{
struct io_kiocb *req;
- ssize_t ret;
+ int ret;
/* enforce forwards compatibility on users */
if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1628,7 +1619,9 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
if (unlikely(!req))
return -EAGAIN;
- req->rw.ki_filp = NULL;
+ ret = io_req_set_file(ctx, s, state, req);
+ if (unlikely(ret))
+ goto out;
ret = __io_submit_sqe(ctx, req, s, true, state);
if (ret == -EAGAIN) {
@@ -1649,11 +1642,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
INIT_WORK(&req->work, io_sq_wq_submit_work);
queue_work(ctx->sqo_wq, &req->work);
}
- ret = 0;
+
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually
+ * submitted.
+ */
+ return 0;
}
}
+
+out:
+ /* drop submission reference */
+ io_put_req(req);
+
+ /* and drop final reference, if we failed */
if (ret)
- io_free_req(req);
+ io_put_req(req);
return ret;
}
@@ -1664,7 +1669,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
static void io_submit_state_end(struct io_submit_state *state)
{
blk_finish_plug(&state->plug);
- io_file_put(state, NULL);
+ io_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs,
&state->reqs[state->cur_req]);
@@ -1913,6 +1918,10 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+
+ if (kthread_should_park())
+ kthread_parkme();
+
return 0;
}
@@ -1975,7 +1984,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return 0;
if (sig) {
- ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ &ksigmask, &sigsaved, sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, &ksigmask,
+ &sigsaved, sigsz);
+
if (ret)
return ret;
}
@@ -2039,6 +2056,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
if (ctx->sqo_thread) {
ctx->sqo_stop = 1;
mb();
+ kthread_park(ctx->sqo_thread);
kthread_stop(ctx->sqo_thread);
ctx->sqo_thread = NULL;
}
@@ -2200,6 +2218,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(ctx->user_files[i]);
kfree(ctx->user_files);
+ ctx->user_files = NULL;
ctx->nr_user_files = 0;
return ret;
}
@@ -2220,19 +2239,27 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
- ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
- if (!ctx->sq_thread_idle)
- ctx->sq_thread_idle = HZ;
-
ret = -EINVAL;
if (!cpu_possible(p->sq_thread_cpu))
goto err;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto err;
+
+ ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+ if (!ctx->sq_thread_idle)
+ ctx->sq_thread_idle = HZ;
+
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu;
cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+ ret = -EINVAL;
+ if (!cpu_possible(p->sq_thread_cpu))
+ goto err;
+
ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
ctx, cpu,
"io_uring-sq");
@@ -2902,11 +2929,23 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
+ __releases(ctx->uring_lock)
+ __acquires(ctx->uring_lock)
{
int ret;
percpu_ref_kill(&ctx->refs);
+
+ /*
+ * Drop uring mutex before waiting for references to exit. If another
+ * thread is currently inside io_uring_enter() it might need to grab
+ * the uring_lock to make progress. If we hold it here across the drain
+ * wait, then we can deadlock. It's safe to drop the mutex here, since
+ * no new references will come in after we've killed the percpu ref.
+ */
+ mutex_unlock(&ctx->uring_lock);
wait_for_completion(&ctx->ctx_done);
+ mutex_lock(&ctx->uring_lock);
switch (opcode) {
case IORING_REGISTER_BUFFERS:
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+ int i;
- bio_for_each_segment_all(bvec, bio, i, iter_all)
- put_page(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
+ put_page(bvec->bv_page);
+ }
bio_put(bio);
}
}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 389ea53ea487..bccfc40b3a74 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL);
- if (f->target) {
- kfree(f->target);
- f->target = NULL;
- }
-
fds = f->dents;
while(fds) {
fd = fds;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index bb6ae387469f..05d892c79339 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
static void jffs2_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+ kfree(f->target);
+ kmem_cache_free(jffs2_inode_cachep, f);
}
static void jffs2_destroy_inode(struct inode *inode)
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 93fb7cf0b92b..f0b5c987d6ae 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host)
WARN_ON_ONCE(host->h_server);
- if (refcount_dec_and_test(&host->h_count)) {
+ if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
WARN_ON_ONCE(!list_empty(&host->h_lockowners));
WARN_ON_ONCE(!list_empty(&host->h_granted));
WARN_ON_ONCE(!list_empty(&host->h_reclaim));
- mutex_lock(&nlm_host_mutex);
nlm_destroy_host_locked(host);
mutex_unlock(&nlm_host_mutex);
}
diff --git a/fs/locks.c b/fs/locks.c
index eaa1cfaf73b0..71d0c6c2aac5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
*/
error = -EDEADLK;
spin_lock(&blocked_lock_lock);
+ /*
+ * Ensure that we don't find any locks blocked on this
+ * request during deadlock detection.
+ */
+ __locks_wake_up_blocks(request);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
__locks_insert_block(fl, request,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fb1cf1a4bda2..90d71fda65ce 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
case XPRT_TRANSPORT_RDMA:
if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_TCP_RETRANS;
- if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f9264e1922a2..6673d4ff5a2a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1289,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ int new_idx = hdr->pgio_mirror_idx;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
@@ -1307,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
case -NFS4ERR_RESET_TO_PNFS:
if (ff_layout_choose_best_ds_for_read(hdr->lseg,
hdr->pgio_mirror_idx + 1,
- &hdr->pgio_mirror_idx))
+ &new_idx))
goto out_layouterror;
set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
return task->tk_status;
@@ -1320,7 +1321,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return 0;
out_layouterror:
+ ff_layout_read_record_layoutstats_done(task, hdr);
ff_layout_send_layouterror(hdr->lseg);
+ hdr->pgio_mirror_idx = new_idx;
out_eagain:
rpc_restart_call_prepare(task);
return -EAGAIN;
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ff6f85fb676b..5196bfa7894d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
};
ssize_t err, err2;
- if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
- return -EOPNOTSUPP;
-
src_lock = nfs_get_lock_context(nfs_file_open_context(src));
if (IS_ERR(src_lock))
return PTR_ERR(src_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 45b2322e092d..00d17198ee12 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
+ if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
if (file_inode(file_in) == file_inode(file_out))
- return -EINVAL;
+ return -EOPNOTSUPP;
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4dbb0ee23432..741ff8c9c6ed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2933,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ if (!opendata->cancelled)
+ nfs4_sequence_free_slot(&opendata->o_res.seq_res);
return ret;
}
@@ -6301,7 +6302,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->arg.seqid = seqid;
p->res.seqid = seqid;
p->lsp = lsp;
- refcount_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
p->l_ctx = nfs_get_lock_context(ctx);
@@ -6526,7 +6526,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->res.lock_seqid = p->arg.lock_seqid;
p->lsp = lsp;
p->server = server;
- refcount_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfcabc33e24d..602446158bfb 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2589,7 +2589,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
rpc_prepare_reply_pages(req, args->acl_pages, 0,
- args->acl_len, replen);
+ args->acl_len, replen + 1);
encode_nops(&hdr);
}
@@ -2811,7 +2811,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
}
rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
- PAGE_SIZE, replen);
+ PAGE_SIZE, replen + 1);
encode_nops(&hdr);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8247bd1634cb..7066cd7c7aff 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1889,7 +1889,7 @@ lookup_again:
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- atomic_read(&lo->plh_outstanding)));
+ !atomic_read(&lo->plh_outstanding)));
if (IS_ERR(lseg) || !list_empty(&lo->plh_segs))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 23790c7b2289..c27ac96a95bd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options,
memcpy(sap, &data->addr, sizeof(data->addr));
args->nfs_server.addrlen = sizeof(data->addr);
args->nfs_server.port = ntohs(data->addr.sin_port);
- if (!nfs_verify_server_address(sap))
+ if (sap->sa_family != AF_INET ||
+ !nfs_verify_server_address(sap))
goto out_no_address;
if (!(data->flags & NFS_MOUNT_TCP))
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 8f933e84cec1..9bc32af4e2da 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -442,7 +442,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
__be32 nfserr;
- int count;
+ int count = 0;
+ struct page **p;
+ caddr_t page_addr = NULL;
dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -462,7 +464,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie,
&resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
- resp->count = resp->buffer - argp->buffer;
+ count = 0;
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
+
+ if (((caddr_t)resp->buffer >= page_addr) &&
+ ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+ count += (caddr_t)resp->buffer - page_addr;
+ break;
+ }
+ count += PAGE_SIZE;
+ }
+ resp->count = count >> 2;
if (resp->offset) {
loff_t offset = argp->cookie;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 93fea246f676..8d789124ed3c 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -573,6 +573,7 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readdirargs *args = rqstp->rq_argp;
+ int len;
u32 max_blocksize = svc_max_payload(rqstp);
p = decode_fh(p, &args->fh);
@@ -582,8 +583,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
- args->count = min_t(u32, args->count, max_blocksize);
- args->buffer = page_address(*(rqstp->rq_next_page++));
+ len = args->count = min_t(u32, args->count, max_blocksize);
+
+ while (len > 0) {
+ struct page *p = *(rqstp->rq_next_page++);
+ if (!args->buffer)
+ args->buffer = page_address(p);
+ len -= PAGE_SIZE;
+ }
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d219159b98af..7caa3801ce72 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1010,8 +1010,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion) {
- if (!nfsd41_cb_get_slot(clp, task))
+ if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task))
return;
+ cb->cb_holds_slot = true;
}
rpc_call_start(task);
}
@@ -1038,6 +1039,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
return true;
}
+ if (!cb->cb_holds_slot)
+ goto need_restart;
+
switch (cb->cb_seq_status) {
case 0:
/*
@@ -1076,6 +1080,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
cb->cb_seq_status);
}
+ cb->cb_holds_slot = false;
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
@@ -1283,6 +1288,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_seq_status = 1;
cb->cb_status = 0;
cb->cb_need_restart = false;
+ cb->cb_holds_slot = false;
}
void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6a45fb00c5fc..f056b1d3fecd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -265,6 +265,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
static void
free_blocked_lock(struct nfsd4_blocked_lock *nbl)
{
+ locks_delete_block(&nbl->nbl_lock);
locks_release_private(&nbl->nbl_lock);
kfree(nbl);
}
@@ -293,11 +294,18 @@ remove_blocked_locks(struct nfs4_lockowner *lo)
nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
nbl_lru);
list_del_init(&nbl->nbl_lru);
- locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
}
+static void
+nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
+{
+ struct nfsd4_blocked_lock *nbl = container_of(cb,
+ struct nfsd4_blocked_lock, nbl_cb);
+ locks_delete_block(&nbl->nbl_lock);
+}
+
static int
nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
@@ -325,6 +333,7 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
}
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+ .prepare = nfsd4_cb_notify_lock_prepare,
.done = nfsd4_cb_notify_lock_done,
.release = nfsd4_cb_notify_lock_release,
};
@@ -4863,7 +4872,6 @@ nfs4_laundromat(struct nfsd_net *nn)
nbl = list_first_entry(&reaplist,
struct nfsd4_blocked_lock, nbl_lru);
list_del_init(&nbl->nbl_lru);
- locks_delete_block(&nbl->nbl_lock);
free_blocked_lock(nbl);
}
out:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 396c76755b03..9d6cb246c6c5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,7 @@ struct nfsd4_callback {
int cb_seq_status;
int cb_status;
bool cb_need_restart;
+ bool cb_holds_slot;
};
struct nfsd4_callback_ops {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 56992b32c6bb..a90bb19dcfa2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
+ unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh;
size_t fh_len = event->fh_len;
size_t len = fanotify_event_info_len(event);
@@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
buf += sizeof(handle);
len -= sizeof(handle);
- if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ /*
+ * For an inline fh, copy through stack to exclude the copy from
+ * usercopy hardening protections.
+ */
+ fh = fanotify_event_fh(event);
+ if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
+ memcpy(bounce, fh, fh_len);
+ fh = bounce;
+ }
+ if (copy_to_user(buf, fh, fh_len))
return -EFAULT;
/* Pad with 0's */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2901fbb9f76..7b53598c8804 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
- else if (create)
- return -EEXIST;
+ else if (create) {
+ ret = -EEXIST;
+ goto out;
+ }
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
@@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* return the wd */
ret = i_mark->wd;
+out:
/* match the get from fsnotify_find_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a35259eebc56..1dc9a08e8bdc 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4719,22 +4719,23 @@ out:
/* Lock an inode and grab a bh pointing to the inode. */
int ocfs2_reflink_inodes_lock(struct inode *s_inode,
- struct buffer_head **bh1,
+ struct buffer_head **bh_s,
struct inode *t_inode,
- struct buffer_head **bh2)
+ struct buffer_head **bh_t)
{
- struct inode *inode1;
- struct inode *inode2;
+ struct inode *inode1 = s_inode;
+ struct inode *inode2 = t_inode;
struct ocfs2_inode_info *oi1;
struct ocfs2_inode_info *oi2;
+ struct buffer_head *bh1 = NULL;
+ struct buffer_head *bh2 = NULL;
bool same_inode = (s_inode == t_inode);
+ bool need_swap = (inode1->i_ino > inode2->i_ino);
int status;
/* First grab the VFS and rw locks. */
lock_two_nondirectories(s_inode, t_inode);
- inode1 = s_inode;
- inode2 = t_inode;
- if (inode1->i_ino > inode2->i_ino)
+ if (need_swap)
swap(inode1, inode2);
status = ocfs2_rw_lock(inode1, 1);
@@ -4757,17 +4758,13 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
- if (*bh1)
- *bh1 = NULL;
- if (*bh2)
- *bh2 = NULL;
-
/* We always want to lock the one with the lower lockid first. */
if (oi1->ip_blkno > oi2->ip_blkno)
mlog_errno(-ENOLCK);
/* lock id1 */
- status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
+ OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -4776,15 +4773,25 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
/* lock id2 */
if (!same_inode) {
- status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto out_cl1;
}
- } else
- *bh2 = *bh1;
+ } else {
+ bh2 = bh1;
+ }
+
+ /*
+ * If we swapped inode order above, we have to swap the buffer heads
+ * before passing them back to the caller.
+ */
+ if (need_swap)
+ swap(bh1, bh2);
+ *bh_s = bh1;
+ *bh_t = bh2;
trace_ocfs2_double_lock_end(
(unsigned long long)oi1->ip_blkno,
@@ -4794,8 +4801,7 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
out_cl1:
ocfs2_inode_unlock(inode1, 1);
- brelse(*bh1);
- *bh1 = NULL;
+ brelse(bh1);
out_rw2:
ocfs2_rw_unlock(inode2, 1);
out_i2:
diff --git a/fs/open.c b/fs/open.c
index 0285ce7dbd51..a00350018a47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f,
return 0;
}
+ /* Any file opened for execve()/uselib() has to be a regular file. */
+ if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
+ error = -EACCES;
+ goto cleanup_file;
+ }
+
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
@@ -1209,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp)
}
EXPORT_SYMBOL(nonseekable_open);
+
+/*
+ * stream_open is used by subsystems that want stream-like file descriptors.
+ * Such file descriptors are not seekable and don't have notion of position
+ * (file.f_pos is always 0). Contrary to file descriptors of other regular
+ * files, .read() and .write() can run simultaneously.
+ *
+ * stream_open never fails and is marked to return int so that it could be
+ * directly used as file_operations.open .
+ */
+int stream_open(struct inode *inode, struct file *filp)
+{
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+ filp->f_mode |= FMODE_STREAM;
+ return 0;
+}
+
+EXPORT_SYMBOL(stream_open);
diff --git a/fs/pipe.c b/fs/pipe.c
index 070aad543382..41065901106b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
* in the tee() system call, when we duplicate the buffers in one
* pipe into another.
*/
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- get_page(buf->page);
+ return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f5ebdd87afb2..6a803a0b75df 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -616,24 +616,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- long nr;
- unsigned long args[6], sp, pc;
+ struct syscall_info info;
+ u64 *args = &info.data.args[0];
int res;
res = lock_trace(task);
if (res)
return res;
- if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+ if (task_current_syscall(task, &info))
seq_puts(m, "running\n");
- else if (nr < 0)
- seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ else if (info.data.nr < 0)
+ seq_printf(m, "%d 0x%llx 0x%llx\n",
+ info.data.nr, info.sp, info.data.instruction_pointer);
else
seq_printf(m,
- "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
- nr,
+ "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+ info.data.nr,
args[0], args[1], args[2], args[3], args[4], args[5],
- sp, pc);
+ info.sp, info.data.instruction_pointer);
unlock_trace(task);
return 0;
@@ -3074,6 +3075,15 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek,
};
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+ if (!d_is_dir(file->f_path.dentry) ||
+ (file->f_op != &proc_tgid_base_operations))
+ return ERR_PTR(-EBADF);
+
+ return proc_pid(file_inode(file));
+}
+
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index bbcc185062bb..f5834488b67d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+ if (mem_pfn_is_ram)
+ return -EBUSY;
+ mem_pfn_is_ram = fn;
+ return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+ if (mem_pfn_is_ram)
+ return mem_pfn_is_ram(pfn);
+ else
+ return 1;
+}
+
/* This doesn't grab kclist_lock, so it should only be used at init time. */
void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
int type)
@@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
goto out;
}
m = NULL; /* skip the list anchor */
+ } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
@@ -588,7 +615,7 @@ static void __init proc_kcore_text_init(void)
/*
* MODULES_VADDR has no intersection with VMALLOC_ADDR.
*/
-struct kcore_list kcore_modules;
+static struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 4d598a399bbf..d65390727541 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1626,7 +1626,8 @@ static void drop_sysctl_table(struct ctl_table_header *header)
if (--header->nreg)
return;
- put_links(header);
+ if (parent)
+ put_links(header);
start_unregistering(header);
if (!--header->count)
kfree_rcu(header, rcu);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 92a91e7816d8..95ca1fe7283c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
+ /*
+ * Avoid to modify vma->vm_flags
+ * without locked ops while the
+ * coredump reads the vm_flags.
+ */
+ if (!mmget_still_valid(mm)) {
+ /*
+ * Silently return "count"
+ * like if get_task_mm()
+ * failed. FIXME: should this
+ * function have returned
+ * -ESRCH if get_task_mm()
+ * failed like if
+ * get_proc_task() fails?
+ */
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
diff --git a/fs/read_write.c b/fs/read_write.c
index 177ccc3d405a..61b43ad7608e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
static inline loff_t file_pos_read(struct file *file)
{
- return file->f_pos;
+ return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
}
static inline void file_pos_write(struct file *file, loff_t pos)
{
- file->f_pos = pos;
+ if ((file->f_mode & FMODE_STREAM) == 0)
+ file->f_pos = pos;
}
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
diff --git a/fs/splice.c b/fs/splice.c
index 3ee7e82df48f..98943d9b219c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1593,7 +1593,11 @@ retry:
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
*obuf = *ibuf;
/*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 4cb21b558a85..1b56686ab178 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -71,9 +71,11 @@ static int sysfs_init_fs_context(struct fs_context *fc)
kfc->magic = SYSFS_MAGIC;
fc->fs_private = kfc;
fc->ops = &sysfs_fs_context_ops;
- if (fc->user_ns)
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(netns->user_ns);
+ if (netns) {
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(netns->user_ns);
+ }
fc->global = true;
return 0;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8dc2818fdd84..12628184772c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -276,14 +276,12 @@ static void ubifs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
struct ubifs_inode *ui = ubifs_inode(inode);
+ kfree(ui->data);
kmem_cache_free(ubifs_inode_slab, ui);
}
static void ubifs_destroy_inode(struct inode *inode)
{
- struct ubifs_inode *ui = ubifs_inode(inode);
-
- kfree(ui->data);
call_rcu(&inode->i_rcu, ubifs_i_callback);
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ae796e10f68b..e7276932e433 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1242,8 +1242,10 @@ set_size:
truncate_setsize(inode, newsize);
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
- udf_truncate_extents(inode);
+ err = udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
+ if (err)
+ return err;
}
update_time:
inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index b647f0bd150c..63a47f1e1d52 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
* for making file shorter. For making file longer, udf_extend_file() has to
* be used.
*/
-void udf_truncate_extents(struct inode *inode)
+int udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
struct kernel_lb_addr eloc, neloc = {};
@@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode)
if (etype == -1) {
/* We should extend the file? */
WARN_ON(byte_offset);
- return;
+ return 0;
}
epos.offset -= adsize;
extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
@@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode)
epos.block = eloc;
epos.bh = udf_tread(sb,
udf_get_lb_pblock(sb, &eloc, 0));
+ /* Error reading indirect block? */
+ if (!epos.bh)
+ return -EIO;
if (elen)
indirect_ext_len =
(elen + sb->s_blocksize - 1) >>
@@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode)
iinfo->i_lenExtents = inode->i_size;
brelse(epos.bh);
+ return 0;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ee246769dee4..d89ef71887fc 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
extern void udf_discard_prealloc(struct inode *);
-extern void udf_truncate_extents(struct inode *);
+extern int udf_truncate_extents(struct inode *);
/* balloc.c */
extern void udf_free_blocks(struct super_block *, struct inode *,
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 89800fc7dc9d..f5de1e726356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
down_write(&mm->mmap_sem);
+ /* no task can run (and in turn coredump) yet */
+ VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
+skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
@@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 48502cb9990f..4637ae1ae91c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1191,7 +1191,10 @@ xfs_iread_extents(
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
+ if (unlikely(level == 0)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
@@ -4249,9 +4252,13 @@ xfs_bmapi_write(
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4319,10 +4326,6 @@ xfs_bmapi_write(
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
- bma.tp = tp;
- bma.ip = ip;
- bma.total = total;
- bma.datatype = 0;
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
n = 0;
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 9a3767818c50..9c2a0a13ed61 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry(
*/
int /* error */
xfs_dir2_leaf_addname(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
__be16 *bestsp; /* freespace table in leaf */
- int compact; /* need to compact leaves */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
+ __be16 *tagp; /* end of data entry */
struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data block entry */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
int error; /* error return value */
int grown; /* allocated new data block */
- int highstale; /* index of next stale leaf */
+ int highstale = 0; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- struct xfs_buf *lbp; /* leaf's buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
int lfloglow; /* low leaf logging index */
int lfloghigh; /* high leaf logging index */
- int lowstale; /* index of prev stale leaf */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ int lowstale = 0; /* index of prev stale leaf */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- __be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
- struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
- dp = args->dp;
- tp = args->trans;
-
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 3b03703c5c3d..16731d2d684b 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node(
static int /* error */
xfs_dir2_leafn_add(
struct xfs_buf *bp, /* leaf buffer */
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
int compact; /* compacting stale leaves */
- xfs_inode_t *dp; /* incore directory inode */
- int highstale; /* next stale entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int highstale = 0; /* next stale entry */
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
- int lowstale; /* previous stale entry */
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
+ int lowstale = 0; /* previous stale entry */
trace_xfs_dir2_leafn_add(args, index);
- dp = args->dp;
- leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 6f94d1f7322d..117910db51b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -415,8 +415,17 @@ xchk_btree_check_owner(
struct xfs_btree_cur *cur = bs->cur;
struct check_owner *co;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
+ }
/*
* We want to cross-reference each btree block with the bnobt
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4bfdee..90527b094878 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -574,6 +574,11 @@ xchk_da_btree(
/* Drill another level deeper. */
blkno = be32_to_cpu(key->before);
level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(&ds, level - 1);
+ break;
+ }
ds.tree_level--;
error = xchk_da_btree_block(&ds, level, blkno);
if (error)
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07edafd81..9ee2a7d02e70 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -161,6 +161,14 @@ xfs_ioc_trim(
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return -EROFS;
+
if (copy_from_user(&range, urange, sizeof(range)))
return -EFAULT;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f2e2845eb76..a7ceae90110e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
count = iov_iter_count(from);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to take the exclusive lock
- * for other reasons in xfs_file_aio_write_checks.
+ * If we are doing unaligned IO, we can't allow any other overlapping IO
+ * in-flight at the same time or we risk data corruption. Wait for all
+ * other IO to drain before we submit. If the IO is aligned, demote the
+ * iolock if we had to take the exclusive lock in
+ * xfs_file_aio_write_checks() for other reasons.
*/
if (unaligned_io) {
- /* If we are going to wait for other DIO to finish, bail */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (atomic_read(&inode->i_dio_count))
- return -EAGAIN;
- } else {
- inode_dio_wait(inode);
- }
+ /* unaligned dio always waits, bail */
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+
+ /*
+ * If unaligned, this is the only IO in-flight. If it has not yet
+ * completed, wait on it before we release the iolock to prevent
+ * subsequent overlapping IO.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_io)
+ inode_dio_wait(inode);
out:
xfs_iunlock(ip, iolock);