summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_super.c3
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/super.c3
-rw-r--r--fs/affs/super.c3
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/inode.c3
-rw-r--r--fs/ceph/addr.c416
-rw-r--r--fs/ceph/caps.c128
-rw-r--r--fs/ceph/debugfs.c18
-rw-r--r--fs/ceph/dir.c20
-rw-r--r--fs/ceph/file.c85
-rw-r--r--fs/ceph/inode.c10
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c109
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/metric.c14
-rw-r--r--fs/ceph/metric.h7
-rw-r--r--fs/ceph/quota.c10
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h13
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/asn1.c16
-rw-r--r--fs/cifs/cifs_unicode.c8
-rw-r--r--fs/cifs/cifsacl.c5
-rw-r--r--fs/cifs/cifsfs.c8
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h20
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/connect.c264
-rw-r--r--fs/cifs/fs_context.c221
-rw-r--r--fs/cifs/fs_context.h58
-rw-r--r--fs/cifs/inode.c57
-rw-r--r--fs/cifs/readdir.c60
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c23
-rw-r--r--fs/cifs/smb2maperror.c4
-rw-r--r--fs/cifs/smb2ops.c213
-rw-r--r--fs/cifs/smb2pdu.c53
-rw-r--r--fs/cifs/smb2pdu.h121
-rw-r--r--fs/cifs/smb2proto.h6
-rw-r--r--fs/cifs/smb2transport.c8
-rw-r--r--fs/cifs/smbfsctl.h2
-rw-r--r--fs/cifs/trace.h18
-rw-r--r--fs/cifs/transport.c5
-rw-r--r--fs/cramfs/inode.c3
-rw-r--r--fs/dax.c29
-rw-r--r--fs/efs/super.c3
-rw-r--r--fs/erofs/super.c3
-rw-r--r--fs/exfat/dir.c29
-rw-r--r--fs/exfat/exfat_fs.h4
-rw-r--r--fs/exfat/file.c4
-rw-r--r--fs/exfat/inode.c5
-rw-r--r--fs/exfat/namei.c153
-rw-r--r--fs/exfat/nls.c2
-rw-r--r--fs/exfat/super.c4
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/block_validity.c10
-rw-r--r--fs/ext4/dir.c4
-rw-r--r--fs/ext4/ext4.h136
-rw-r--r--fs/ext4/ext4_jbd2.c2
-rw-r--r--fs/ext4/extents.c315
-rw-r--r--fs/ext4/extents_status.c24
-rw-r--r--fs/ext4/fast_commit.c2139
-rw-r--r--fs/ext4/fast_commit.h159
-rw-r--r--fs/ext4/file.c12
-rw-r--r--fs/ext4/fsmap.c11
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/ialloc.c173
-rw-r--r--fs/ext4/indirect.c13
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c290
-rw-r--r--fs/ext4/ioctl.c22
-rw-r--r--fs/ext4/mballoc.c257
-rw-r--r--fs/ext4/mmp.c10
-rw-r--r--fs/ext4/move_extent.c2
-rw-r--r--fs/ext4/namei.c206
-rw-r--r--fs/ext4/resize.c14
-rw-r--r--fs/ext4/super.c355
-rw-r--r--fs/ext4/sysfs.c2
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/super.c3
-rw-r--r--fs/fat/inode.c3
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/Makefile6
-rw-r--r--fs/fuse/control.c20
-rw-r--r--fs/fuse/cuse.c21
-rw-r--r--fs/fuse/dax.c1365
-rw-r--r--fs/fuse/dev.c189
-rw-r--r--fs/fuse/dir.c220
-rw-r--r--fs/fuse/file.c255
-rw-r--r--fs/fuse/fuse_i.h185
-rw-r--r--fs/fuse/inode.c391
-rw-r--r--fs/fuse/readdir.c10
-rw-r--r--fs/fuse/virtio_fs.c378
-rw-r--r--fs/fuse/xattr.c34
-rw-r--r--fs/gfs2/aops.c68
-rw-r--r--fs/gfs2/bmap.c62
-rw-r--r--fs/gfs2/bmap.h1
-rw-r--r--fs/gfs2/glock.c52
-rw-r--r--fs/gfs2/glops.c36
-rw-r--r--fs/gfs2/incore.h29
-rw-r--r--fs/gfs2/log.c89
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/meta_io.c81
-rw-r--r--fs/gfs2/meta_io.h2
-rw-r--r--fs/gfs2/ops_fstype.c173
-rw-r--r--fs/gfs2/recovery.c108
-rw-r--r--fs/gfs2/rgrp.c19
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c220
-rw-r--r--fs/gfs2/super.h5
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/gfs2/trace_gfs2.h7
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h10
-rw-r--r--fs/hfs/super.c3
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hpfs/super.c3
-rw-r--r--fs/io-wq.c117
-rw-r--r--fs/io-wq.h19
-rw-r--r--fs/io_uring.c804
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/jbd2/commit.c106
-rw-r--r--fs/jbd2/journal.c245
-rw-r--r--fs/jbd2/recovery.c135
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc4proc.c248
-rw-r--r--fs/lockd/svcproc.c250
-rw-r--r--fs/minix/inode.c3
-rw-r--r--fs/namei.c3
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/nfs/fs_context.c1
-rw-r--r--fs/nfs/namespace.c12
-rw-r--r--fs/nfs/nfs42xattr.c5
-rw-r--r--fs/nfs/nfs42xdr.c167
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4file.c41
-rw-r--r--fs/nfs/nfs4idmap.c15
-rw-r--r--fs/nfs/nfs4proc.c272
-rw-r--r--fs/nfs/nfs4super.c5
-rw-r--r--fs/nfs/nfs4trace.h1
-rw-r--r--fs/nfs/nfs4xdr.c7
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/super.c19
-rw-r--r--fs/nfs/sysfs.c11
-rw-r--r--fs/nfs/sysfs.h2
-rw-r--r--fs/nfs_common/Makefile1
-rw-r--r--fs/nfs_common/nfs_ssc.c94
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/export.c2
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/nfs2acl.c160
-rw-r--r--fs/nfsd/nfs3acl.c88
-rw-r--r--fs/nfsd/nfs3proc.c238
-rw-r--r--fs/nfsd/nfs3xdr.c25
-rw-r--r--fs/nfsd/nfs4proc.c34
-rw-r--r--fs/nfsd/nfs4state.c605
-rw-r--r--fs/nfsd/nfs4xdr.c202
-rw-r--r--fs/nfsd/nfscache.c12
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/nfsd/nfsproc.c283
-rw-r--r--fs/nfsd/nfssvc.c122
-rw-r--r--fs/nfsd/nfsxdr.c52
-rw-r--r--fs/nfsd/state.h27
-rw-r--r--fs/nfsd/trace.h4
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/nfsd/xdr.h16
-rw-r--r--fs/nfsd/xdr3.h1
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/super.c3
-rw-r--r--fs/ntfs/super.c3
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/omfs/inode.c6
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/inode.c119
-rw-r--r--fs/proc/proc_sysctl.c48
-rw-r--r--fs/proc_namespace.c1
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/qnx6/inode.c3
-rw-r--r--fs/read_write.c615
-rw-r--r--fs/remap_range.c571
-rw-r--r--fs/romfs/super.c3
-rw-r--r--fs/splice.c193
-rw-r--r--fs/squashfs/super.c3
-rw-r--r--fs/stat.c70
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/sysv/inode.c3
-rw-r--r--fs/udf/super.c3
-rw-r--r--fs/ufs/super.c3
-rw-r--r--fs/xfs/Kconfig25
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c19
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h18
-rw-r--r--fs/xfs/libxfs/xfs_defer.c232
-rw-r--r--fs/xfs/libxfs/xfs_defer.h37
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h2
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c27
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c11
-rw-r--r--fs/xfs/scrub/dabtree.c14
-rw-r--r--fs/xfs/xfs_bmap_item.c132
-rw-r--r--fs/xfs/xfs_bmap_util.c18
-rw-r--r--fs/xfs/xfs_buf_item_recover.c2
-rw-r--r--fs/xfs/xfs_dquot.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c44
-rw-r--r--fs/xfs/xfs_file.c40
-rw-r--r--fs/xfs/xfs_filestream.c34
-rw-r--r--fs/xfs/xfs_fsmap.c48
-rw-r--r--fs/xfs/xfs_fsmap.h6
-rw-r--r--fs/xfs/xfs_inode.c123
-rw-r--r--fs/xfs/xfs_ioctl.c144
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_linux.h7
-rw-r--r--fs/xfs/xfs_log.c44
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_recover.c229
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_refcount_item.c51
-rw-r--r--fs/xfs/xfs_rmap_item.c42
-rw-r--r--fs/xfs/xfs_rtalloc.c31
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c47
-rw-r--r--fs/xfs/xfs_sysctl.c36
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h33
-rw-r--r--fs/xfs/xfs_trans_dquot.c43
-rw-r--r--fs/zonefs/super.c224
-rw-r--r--fs/zonefs/zonefs.h10
240 files changed, 13448 insertions, 5077 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6ecf863bfa2f..b177fd3b1eb3 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -612,9 +612,9 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = WB_SYNC_ALL,
- .range_start = vma->vm_pgoff * PAGE_SIZE,
+ .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
/* absolute end, byte at end included */
- .range_end = vma->vm_pgoff * PAGE_SIZE +
+ .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
(vma->vm_end - vma->vm_start - 1),
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index e34fa20acf61..9a21269b7234 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -260,8 +260,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = rs.bavail;
buf->f_files = rs.files;
buf->f_ffree = rs.ffree;
- buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = u64_to_fsid(rs.fsid);
buf->f_namelen = rs.namelen;
}
if (res != -ENOSYS)
diff --git a/fs/Makefile b/fs/Makefile
index 7bb2a05fda1f..999d1a23f036 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o
+ kernel_read_file.o remap_range.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index d553bb5bc17a..bdbd26e571ed 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -210,8 +210,7 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = sbi->s_namelen;
buf->f_bsize = sb->s_blocksize;
buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a100cd9950c8..c6c2a513ec92 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -620,8 +620,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
buf->f_bfree = free;
buf->f_bavail = free;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = AFFSNAMEMAX;
return 0;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2482032021ca..c1ba13d19024 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -963,8 +963,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = 0; /* UNKNOWN */
buf->f_ffree = 0; /* UNKNOWN */
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = BEFS_NAME_LEN;
befs_debug(sb, "<--- %s", __func__);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f8ce1368218b..3ac7611ef7ce 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -229,8 +229,7 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = buf->f_bavail = info->si_freeb;
buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO;
buf->f_ffree = info->si_freei;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = BFS_NAMELEN;
return 0;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6ea761c84494..35c83f65475b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
return !PagePrivate(page);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-static int ceph_sync_readpages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages,
- int page_align)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-
-/*
- * read a single page, without unlocking it.
- */
+/* read a single page, without unlocking it. */
static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ struct ceph_vino vino = ceph_vino(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
@@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
if (err == 0)
return -EINPROGRESS;
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_sync_readpages(fsc, ceph_vino(inode),
- &ci->i_layout, off, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
+ dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+ vino.ino, vino.snap, filp, off, len, page, page->index);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+
+ err = ceph_osdc_start_request(osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ dout("readpage result %d\n", err);
+
if (err == -ENOENT)
err = 0;
if (err < 0) {
- SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out;
}
if (err < PAGE_SIZE)
@@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- if (rc == -EBLACKLISTED)
- ceph_inode_to_client(inode)->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ ceph_inode_to_client(inode)->blocklisted = true;
/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -620,50 +591,6 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * do a synchronous write on N pages
- */
-static int ceph_sync_writepages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-
-/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
+ struct inode *inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
- int err, len = PAGE_SIZE;
+ int err;
+ loff_t len = PAGE_SIZE;
struct ceph_writeback_ctl ceph_wbc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
dout("writepage %p idx %lu\n", page, page->index);
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
-
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
@@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_sync_writepages(fsc, ceph_vino(inode),
- &ci->i_layout, snapc, page_off, len,
- ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size,
- &inode->i_mtime, &page, 1);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
+ true);
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ return PTR_ERR(req);
+ }
+
+ /* it may be a short write due to an object boundary */
+ WARN_ON_ONCE(len > PAGE_SIZE);
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+ dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+
+ req->r_mtime = inode->i_mtime;
+ err = ceph_osdc_start_request(osdc, req, true);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ if (err == 0)
+ err = len;
+
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
@@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
mapping_set_error(&inode->i_data, err);
@@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
- if (rc == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
@@ -962,9 +910,8 @@ retry:
max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
- pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_DIRTY,
- max_pages - locked_pages);
+ pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY);
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
return ret;
}
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @page: page being dirtied
+ *
+ * We are only allowed to write into/dirty a page if the page is
+ * clean, or already dirty within the same snap context. Returns a
+ * conflicting context if there is one, NULL if there isn't, or a
+ * negative error code on other errors.
*
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * Must be called with page lock held.
*/
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct page *page)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = page->mapping->host;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_MASK;
- int pos_in_page = pos & ~PAGE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
- unlock_page(page);
- return -EIO;
+ return ERR_PTR(-EIO);
}
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
+ for (;;) {
+ struct ceph_snap_context *snapc, *oldest;
+
+ wait_on_page_writeback(page);
+
+ snapc = page_snap_context(page);
+ if (!snapc || snapc == ci->i_head_snapc)
+ break;
- snapc = page_snap_context(page);
- if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
+ /* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_killable(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
+ dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_unlock;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
- }
-
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_SIZE)
- return 0;
-
- /* past end of file? */
- i_size = i_size_read(inode);
-
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_SIZE);
- return 0;
- }
-
- /* we need to read it. */
- r = ceph_do_readpage(file, page);
- if (r < 0) {
- if (r == -EINPROGRESS)
- return -EAGAIN;
- goto fail_unlock;
+ dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ if (clear_page_dirty_for_io(page)) {
+ int r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+ }
}
- goto retry_locked;
-fail_unlock:
- unlock_page(page);
- return r;
+ return NULL;
}
/*
@@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct page *page;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct page *page = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
- int r;
+ int pos_in_page = pos & ~PAGE_MASK;
+ int r = 0;
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
+ dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
+ for (;;) {
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page) {
+ r = -ENOMEM;
+ break;
+ }
- r = ceph_update_writeable_page(file, pos, len, page);
- if (r < 0)
+ snapc = ceph_find_incompatible(page);
+ if (snapc) {
+ if (IS_ERR(snapc)) {
+ r = PTR_ERR(snapc);
+ break;
+ }
+ unlock_page(page);
put_page(page);
- else
- *pagep = page;
- } while (r == -EAGAIN);
+ page = NULL;
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ if (r != 0)
+ break;
+ continue;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ break;
+ }
+
+ /*
+ * In some cases we don't need to read at all:
+ * - full page write
+ * - write that lies completely beyond EOF
+ * - write that covers the the page from start to EOF or beyond it
+ */
+ if ((pos_in_page == 0 && len == PAGE_SIZE) ||
+ (pos >= i_size_read(inode)) ||
+ (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
+ zero_user_segments(page, 0, pos_in_page,
+ pos_in_page + len, PAGE_SIZE);
+ break;
+ }
+ /*
+ * We need to read it. If we get back -EINPROGRESS, then the page was
+ * handed off to fscache and it will be unlocked when the read completes.
+ * Refind the page in that case so we can reacquire the page lock. Otherwise
+ * we got a hard error or the read was completed synchronously.
+ */
+ r = ceph_do_readpage(file, page);
+ if (r != -EINPROGRESS)
+ break;
+ }
+
+ if (r < 0) {
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else {
+ *pagep = page;
+ }
return r;
}
@@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_SHIFT;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
inode_inc_iversion_raw(inode);
do {
+ struct ceph_snap_context *snapc;
+
lock_page(page);
if (page_mkwrite_check_truncate(page, inode) < 0) {
@@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
break;
}
- err = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (err >= 0) {
+ snapc = ceph_find_incompatible(page);
+ if (!snapc) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
+ break;
}
- } while (err == -EAGAIN);
+
+ unlock_page(page);
+
+ if (IS_ERR(snapc)) {
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ ceph_queue_writeback(inode);
+ err = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ } while (err == 0);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM) {
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
- if (err2 == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err2 == -EBLOCKLISTED)
+ fsc->blocklisted = true;
err = err2;
goto out_unlock;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 034b3f4fdd3a..5027bbdca419 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1222,36 +1222,27 @@ struct cap_msg_args {
};
/*
- * Build and send a cap message to the given MDS.
- *
- * Caller should be holding s_mutex.
+ * cap struct size + flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid
*/
-static int send_cap_msg(struct cap_msg_args *arg)
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
+
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
void *p;
- size_t extra_len;
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
- arg->mseq, arg->follows, arg->size, arg->max_size,
- arg->xattr_version,
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid */
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
- GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
-
msg->hdr.version = cpu_to_le16(10);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
-
- ceph_con_send(&arg->session->s_con, msg);
- return 0;
}
/*
@@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
*
* Caller should hold snap_rwsem (read), s_mutex.
*/
-static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
- struct ceph_inode_info *ci)
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
+ struct ceph_msg *msg;
struct inode *inode = &ci->vfs_inode;
- int ret;
- ret = send_cap_msg(arg);
- if (ret < 0) {
- pr_err("error sending cap msg, ino (%llx.%llx) "
- "flushing %s tid %llu, requeue\n",
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
ceph_vinop(inode), ceph_cap_string(arg->dirty),
arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
+ return;
}
+ encode_cap_msg(msg, arg);
+ ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
-
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
u32 mseq, u64 oldest_flush_tid)
{
struct cap_msg_args arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
arg.session = session;
arg.ino = ceph_vino(inode).ino;
@@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
arg.flags = 0;
arg.wake = false;
- return send_cap_msg(&arg);
+ encode_cap_msg(msg, &arg);
+ ceph_con_send(&arg.session->s_con, msg);
+ return 0;
}
/*
@@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
+ /* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
+
+ /* Caps which have active references against them */
used = __ceph_caps_used(ci);
+
+ /*
+ * "issued" represents the current caps that the MDS wants us to have.
+ * "implemented" is the set that we have been granted, and includes the
+ * ones that have not yet been returned to the MDS (the "revoking" set,
+ * usually because they have outstanding references).
+ */
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
want = file_wanted;
+
+ /* The ones we currently want to retain (may be adjusted below) */
retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
if (file_wanted) {
@@ -2011,6 +2017,10 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
@@ -2148,7 +2158,7 @@ ack:
want, retain, flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2222,7 +2232,7 @@ retry_locked:
flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_opened = false;
+ int i;
+
+ if (count == 1)
+ atomic64_inc(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
ci->i_nr_by_mode[i] += count;
+
+ /*
+ * If any of the mode ref is larger than 1,
+ * that means it has been already opened by
+ * others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i] > 1)
+ is_opened = true;
}
+
+ if (!is_opened)
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
@@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_closed = true;
+ int i;
+
+ if (count == 1)
+ atomic64_dec(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
BUG_ON(ci->i_nr_by_mode[i] < count);
ci->i_nr_by_mode[i] -= count;
}
+
+ /*
+ * If any of the mode ref is not 0 after
+ * decreased, that means it is still opened
+ * by others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ is_closed = false;
}
+
+ if (is_closed)
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3e3fcda9b276..7a8fbe3e4751 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
+ sum = percpu_counter_sum(&m->total_inodes);
+ seq_printf(s, "item total\n");
+ seq_printf(s, "------------------------------------------\n");
+ seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
+ atomic64_read(&m->opened_files), sum);
+ seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
+ atomic64_read(&m->total_caps), sum);
+ seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
+ percpu_counter_sum(&m->opened_inodes), sum);
+
+ seq_printf(s, "\n");
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
seq_printf(s, "-----------------------------------------------------------------------------------\n");
@@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
+ seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+ cap->session->s_mds,
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
"reserved\t%d\n"
"min\t\t%d\n\n",
total, avail, used, reserved, min);
- seq_printf(s, "ino issued implemented\n");
- seq_printf(s, "-----------------------------------------------\n");
+ seq_printf(s, "ino mds issued implemented\n");
+ seq_printf(s, "--------------------------------------------------\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d72e4a12bb69..a4d48370b2b3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
@@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int op;
int mask;
@@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
const char *dest)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -942,8 +939,7 @@ out:
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err = -EROFS;
@@ -1010,8 +1006,7 @@ out:
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int err;
@@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3f4c993dfc6f..209535d5b8d3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
- if (ret)
- return ret;
break;
case S_IFLNK:
@@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
*/
int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
size_t page_off;
u64 i_size;
bool more;
+ int idx;
+ size_t left;
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, off, &len, 0, 1,
@@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
more = len < iov_iter_count(to);
- if (unlikely(iov_iter_is_pipe(to))) {
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0) {
- ceph_osdc_put_request(req);
- ret = -ENOMEM;
- break;
- }
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
- if (ret < len) {
- len = ret;
- osd_req_op_extent_update(req, 0, len);
- more = false;
- }
- } else {
- num_pages = calc_pages_for(off, len);
- page_off = off & ~PAGE_MASK;
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages)) {
- ceph_osdc_put_request(req);
- ret = PTR_ERR(pages);
- break;
- }
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
@@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ret += zlen;
}
- if (unlikely(iov_iter_is_pipe(to))) {
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
- } else {
- iov_iter_advance(to, 0);
- }
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- int idx = 0;
- size_t left = ret > 0 ? ret : 0;
- while (left > 0) {
- size_t len, copied;
- page_off = off & ~PAGE_MASK;
- len = min_t(size_t, left, PAGE_SIZE - page_off);
- copied = copy_page_to_iter(pages[idx++],
- page_off, len, to);
- off += copied;
- left -= copied;
- if (copied < len) {
- ret = -EFAULT;
- break;
- }
+ idx = 0;
+ left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ SetPageUptodate(pages[idx]);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
+ break;
}
- ceph_release_page_vector(pages, num_pages);
}
+ ceph_release_page_vector(pages, num_pages);
if (ret < 0) {
- if (ret == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (ret == -EBLOCKLISTED)
+ fsc->blocklisted = true;
break;
}
@@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_client_metric *metric = &fsc->mdsc->metric;
+ struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d163fa96cb40..526faf4778ce 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
ci->i_vino = *(struct ceph_vino *)data;
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
+ percpu_counter_inc(&mdsc->metric.total_inodes);
+
return 0;
}
@@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_frag *frag;
struct rb_node *n;
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ percpu_counter_dec(&mdsc->metric.total_inodes);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
* caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
if (ceph_snap(inode) == CEPH_NOSNAP) {
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
@@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d6b9166e71e4..048a435a29be 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_request *req;
int err;
u64 length = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4a26862d7667..08f1c0c31dc2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3303,7 +3303,7 @@ bad:
}
static int __decode_session_metadata(void **p, void *end,
- bool *blacklisted)
+ bool *blocklisted)
{
/* map<string,string> */
u32 n;
@@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
+ /*
+ * Match "blocklisted (blacklisted)" from newer MDSes,
+ * or "blacklisted" from older MDSes.
+ */
if (err_str && strnstr(*p, "blacklisted", len))
- *blacklisted = true;
+ *blocklisted = true;
*p += len;
}
return 0;
@@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
u32 op;
u64 seq, features = 0;
int wake = 0;
- bool blacklisted = false;
+ bool blocklisted = false;
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
/* version >= 2, metadata */
- if (__decode_session_metadata(&p, end, &blacklisted) < 0)
+ if (__decode_session_metadata(&p, end, &blocklisted) < 0)
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
@@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
- if (blacklisted)
- mdsc->fsc->blacklisted = true;
+ if (blocklisted)
+ mdsc->fsc->blocklisted = true;
wake = 2; /* for good measure */
break;
@@ -3612,6 +3616,39 @@ fail_msg:
return err;
}
+static struct dentry* d_find_primary(struct inode *inode)
+{
+ struct dentry *alias, *dn = NULL;
+
+ if (hlist_empty(&inode->i_dentry))
+ return NULL;
+
+ spin_lock(&inode->i_lock);
+ if (hlist_empty(&inode->i_dentry))
+ goto out_unlock;
+
+ if (S_ISDIR(inode->i_mode)) {
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ if (!IS_ROOT(alias))
+ dn = dget(alias);
+ goto out_unlock;
+ }
+
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ if (!d_unhashed(alias) &&
+ (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+ dn = dget_dlock(alias);
+ }
+ spin_unlock(&alias->d_lock);
+ if (dn)
+ break;
+ }
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ return dn;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- int err;
+ struct dentry *dentry;
+ char *path;
+ int pathlen, err;
+ u64 pathbase;
u64 snap_follows;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
+ dentry = d_find_primary(inode);
+ if (dentry) {
+ /* set pathbase to parent dir when msg_version >= 2 */
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ recon_state->msg_version >= 2);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+ } else {
+ path = NULL;
+ pathlen = 0;
+ pathbase = 0;
+ }
+
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = 0;
+ rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = 0;
+ rec.v1.pathbase = cpu_to_le64(pathbase);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -3725,7 +3781,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -3749,7 +3805,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, NULL, 0);
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -3758,39 +3814,20 @@ encode_again:
out_freeflocks:
kfree(flocks);
} else {
- u64 pathbase = 0;
- int pathlen = 0;
- char *path = NULL;
- struct dentry *dentry;
-
- dentry = d_find_alias(inode);
- if (dentry) {
- path = ceph_mdsc_build_path(dentry,
- &pathlen, &pathbase, 0);
- dput(dentry);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_err;
- }
- rec.v1.pathbase = cpu_to_le64(pathbase);
- }
-
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
pathlen + sizeof(rec.v1));
- if (err) {
- goto out_freepath;
- }
+ if (err)
+ goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-out_freepath:
- ceph_mdsc_free_path(path, pathlen);
}
out_err:
- if (err >= 0)
+ ceph_mdsc_free_path(path, pathlen);
+ if (!err)
recon_state->nr_caps++;
return err;
}
@@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
return;
- if (!READ_ONCE(fsc->blacklisted))
+ if (!READ_ONCE(fsc->blocklisted))
return;
if (fsc->last_auto_reconnect &&
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
return;
- pr_info("auto reconnect after blacklisted\n");
+ pr_info("auto reconnect after blocklisted\n");
fsc->last_auto_reconnect = jiffies;
ceph_force_reconnect(fsc->sb);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 658800605bfb..cbf8af437140 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -393,7 +393,7 @@ struct ceph_mds_client {
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
- int max_sessions; /* len of s_mds_sessions */
+ int max_sessions; /* len of sessions array */
int stopping; /* true if shutting down */
atomic64_t quotarealms_count; /* # realms with quota */
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 2466b261fba2..fee4c4778313 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->total_metadatas = 0;
m->metadata_latency_sum = 0;
+ atomic64_set(&m->opened_files, 0);
+ ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_opened_inodes;
+ ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_total_inodes;
+
m->session = NULL;
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
return 0;
+err_total_inodes:
+ percpu_counter_destroy(&m->opened_inodes);
+err_opened_inodes:
+ percpu_counter_destroy(&m->i_caps_mis);
err_i_caps_mis:
percpu_counter_destroy(&m->i_caps_hit);
err_i_caps_hit:
@@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
if (!m)
return;
+ percpu_counter_destroy(&m->total_inodes);
+ percpu_counter_destroy(&m->opened_inodes);
percpu_counter_destroy(&m->i_caps_mis);
percpu_counter_destroy(&m->i_caps_hit);
percpu_counter_destroy(&m->d_lease_mis);
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 1d0959d669d7..710f3f1dceab 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -115,6 +115,13 @@ struct ceph_client_metric {
ktime_t metadata_latency_min;
ktime_t metadata_latency_max;
+ /* The total number of directories and files that are opened */
+ atomic64_t opened_files;
+
+ /* The total number of inodes that have opened files or directories */
+ struct percpu_counter opened_inodes;
+ struct percpu_counter total_inodes;
+
struct ceph_mds_session *session;
struct delayed_work delayed_work; /* delayed work */
};
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index cc2c4d40b022..83cb4f26b689 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -12,7 +12,7 @@
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
if (inc)
atomic64_inc(&mdsc->quotarealms_count);
else
@@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct super_block *sb = mdsc->fsc->sb;
+ struct super_block *sb = inode->i_sb;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
@@ -266,7 +266,7 @@ restart:
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
@@ -313,7 +313,7 @@ enum quota_check_op {
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 923be9399b21..0da39c16dab4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7ec0e6d03d10..33ba6f0aa55c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -104,8 +104,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
mutex_unlock(&monc->mutex);
- buf->f_fsid.val[0] = fsid & 0xffffffff;
- buf->f_fsid.val[1] = fsid >> 32;
+ buf->f_fsid = u64_to_fsid(fsid);
return 0;
}
@@ -1205,14 +1204,13 @@ nomem:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
- generic_shutdown_super(s);
+ kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@@ -1220,7 +1218,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
- free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
@@ -1243,13 +1240,13 @@ int ceph_force_reconnect(struct super_block *sb)
* see remove_session_caps_cb() */
flush_workqueue(fsc->inode_wq);
- /* In case that we were blacklisted. This also reset
+ /* In case that we were blocklisted. This also reset
* all mon/osd connections */
ceph_reset_client_addr(fsc->client);
ceph_osdc_clear_abort_err(&fsc->client->osdc);
- fsc->blacklisted = false;
+ fsc->blocklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;
if (sb->s_root) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a3995ebe0623..482473e4cce1 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -32,7 +32,7 @@
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
+#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@@ -109,7 +109,7 @@ struct ceph_fs_client {
unsigned long mount_state;
unsigned long last_auto_reconnect;
- bool blacklisted;
+ bool blocklisted;
bool have_copy_from2;
@@ -160,7 +160,8 @@ struct ceph_cap {
int issued; /* latest, from the mds */
int implemented; /* implemented superset of
issued (for revocation) */
- int mds, mds_wanted;
+ int mds; /* mds index for this cap */
+ int mds_wanted; /* caps wanted from this mds */
};
/* caps to release */
struct {
@@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
return (struct ceph_fs_client *)sb->s_fs_info;
}
+static inline struct ceph_mds_client *
+ceph_sb_to_mdsc(const struct super_block *sb)
+{
+ return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+}
+
static inline struct ceph_vino
ceph_vino(const struct inode *inode)
{
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 3a733ac33d9b..197cb1234341 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
* NULL terminates however, so call it on a temporary buffer and then memcpy
* the result into place.
*/
-static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
+static __printf(3, 4)
+int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
int ret;
va_list args;
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 51bae9340842..cd17d0e50f2a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,7 +10,7 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
- smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o
+ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o
cifs-$(CONFIG_CIFS_XATTR) += xattr.o
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 689162e2e175..3150c19cdc2f 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -530,8 +530,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
+ cls, con, tag, end);
return 0;
}
@@ -541,8 +541,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n",
+ cls, con, tag, end);
return 0;
}
@@ -552,8 +552,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
return 0;
} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
|| (tag != ASN1_EOC)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
+ cls, con, tag, end);
return 0;
}
@@ -564,8 +564,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
return 0;
} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
|| (tag != ASN1_SEQ)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
- cls, con, tag, end, *end);
+ cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n",
+ cls, con, tag, sequence_end);
return 0;
}
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 498777d859eb..9bd03a231032 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -488,7 +488,13 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
else if (map_chars == SFM_MAP_UNI_RSVD) {
bool end_of_string;
- if (i == srclen - 1)
+ /**
+ * Remap spaces and periods found at the end of every
+ * component of the path. The special cases of '.' and
+ * '..' do not need to be dealt with explicitly because
+ * they are addressed in namei.c:link_path_walk().
+ **/
+ if ((i == srclen - 1) || (source[i+1] == '\\'))
end_of_string = true;
else
end_of_string = false;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fcff14ef1c70..23b21e943652 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -338,7 +338,7 @@ invalidate_key:
goto out_key_put;
}
-static int
+int
sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
@@ -359,7 +359,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
return -EIO;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) {
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) ||
+ (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) {
uint32_t unix_id;
bool is_group;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0fb99d25e8a8..472cb7777e3e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -71,6 +71,8 @@ bool enable_oplocks = true;
bool linuxExtEnabled = true;
bool lookupCacheEnabled = true;
bool disable_legacy_dialects; /* false by default */
+bool enable_gcm_256; /* false by default, change when more servers support it */
+bool require_gcm_256; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
@@ -104,6 +106,12 @@ MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
+module_param(enable_gcm_256, bool, 0644);
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0");
+
+module_param(require_gcm_256, bool, 0644);
+MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
+
module_param(disable_legacy_dialects, bool, 0644);
MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
"helpful to restrict the ability to "
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 99b3180c613a..905d03863721 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.28"
+#define CIFS_VERSION "2.29"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b565d83ba89e..484ec2d8c5c9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -195,18 +195,6 @@ struct smb_rqst {
unsigned int rq_tailsz; /* length of last page */
};
-enum smb_version {
- Smb_1 = 1,
- Smb_20,
- Smb_21,
- Smb_30,
- Smb_302,
- Smb_311,
- Smb_3any,
- Smb_default,
- Smb_version_err
-};
-
struct mid_q_entry;
struct TCP_Server_Info;
struct cifsFileInfo;
@@ -310,6 +298,10 @@ struct smb_version_operations {
/* query file data from the server */
int (*query_file_info)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *, FILE_ALL_INFO *);
+ /* query reparse tag from srv to determine which type of special file */
+ int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *path,
+ __u32 *reparse_tag);
/* get server index number */
int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *,
@@ -510,6 +502,8 @@ struct smb_version_operations {
struct fiemap_extent_info *, u64, u64);
/* version specific llseek implementation */
loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int);
+ /* Check for STATUS_IO_TIMEOUT */
+ bool (*is_status_io_timeout)(char *buf);
};
struct smb_version_values {
@@ -1954,6 +1948,8 @@ extern bool lookupCacheEnabled;
extern unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
+extern bool enable_gcm_256; /* allow optional negotiate of strongest signing (aes-gcm-256) */
+extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */
extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
extern unsigned int CIFSMaxBufSize; /* max size not including hdr */
extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index bb68cbf81074..24c6f36177ba 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -209,6 +209,8 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
extern int cifs_rename_pending_delete(const char *full_path,
struct dentry *dentry,
const unsigned int xid);
+extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+ struct cifs_fattr *fattr, uint sidtype);
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
bool get_mode_from_special_sid,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a5731dd6e656..c38156f324dd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -61,6 +61,7 @@
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
+#include "fs_context.h"
extern mempool_t *cifs_req_poolp;
extern bool disable_legacy_dialects;
@@ -69,6 +70,9 @@ extern bool disable_legacy_dialects;
#define TLINK_ERROR_EXPIRE (1 * HZ)
#define TLINK_IDLE_EXPIRE (600 * HZ)
+/* Drop the connection to not overload the server */
+#define NUM_STATUS_IO_TIMEOUT 5
+
enum {
/* Mount options that take no arguments */
Opt_user_xattr, Opt_nouser_xattr,
@@ -276,66 +280,6 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_err, NULL }
};
-enum {
- Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
- Opt_sec_ntlmsspi, Opt_sec_ntlmssp,
- Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2,
- Opt_sec_ntlmv2i, Opt_sec_lanman,
- Opt_sec_none,
-
- Opt_sec_err
-};
-
-static const match_table_t cifs_secflavor_tokens = {
- { Opt_sec_krb5, "krb5" },
- { Opt_sec_krb5i, "krb5i" },
- { Opt_sec_krb5p, "krb5p" },
- { Opt_sec_ntlmsspi, "ntlmsspi" },
- { Opt_sec_ntlmssp, "ntlmssp" },
- { Opt_ntlm, "ntlm" },
- { Opt_sec_ntlmi, "ntlmi" },
- { Opt_sec_ntlmv2, "nontlm" },
- { Opt_sec_ntlmv2, "ntlmv2" },
- { Opt_sec_ntlmv2i, "ntlmv2i" },
- { Opt_sec_lanman, "lanman" },
- { Opt_sec_none, "none" },
-
- { Opt_sec_err, NULL }
-};
-
-/* cache flavors */
-enum {
- Opt_cache_loose,
- Opt_cache_strict,
- Opt_cache_none,
- Opt_cache_ro,
- Opt_cache_rw,
- Opt_cache_err
-};
-
-static const match_table_t cifs_cacheflavor_tokens = {
- { Opt_cache_loose, "loose" },
- { Opt_cache_strict, "strict" },
- { Opt_cache_none, "none" },
- { Opt_cache_ro, "ro" },
- { Opt_cache_rw, "singleclient" },
- { Opt_cache_err, NULL }
-};
-
-static const match_table_t cifs_smb_version_tokens = {
- { Smb_1, SMB1_VERSION_STRING },
- { Smb_20, SMB20_VERSION_STRING},
- { Smb_21, SMB21_VERSION_STRING },
- { Smb_30, SMB30_VERSION_STRING },
- { Smb_302, SMB302_VERSION_STRING },
- { Smb_302, ALT_SMB302_VERSION_STRING },
- { Smb_311, SMB311_VERSION_STRING },
- { Smb_311, ALT_SMB311_VERSION_STRING },
- { Smb_3any, SMB3ANY_VERSION_STRING },
- { Smb_default, SMBDEFAULT_VERSION_STRING },
- { Smb_version_err, NULL }
-};
-
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -1117,7 +1061,7 @@ cifs_demultiplex_thread(void *p)
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
- unsigned int noreclaim_flag;
+ unsigned int noreclaim_flag, num_io_timeout = 0;
noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
@@ -1213,6 +1157,16 @@ next_pdu:
continue;
}
+ if (server->ops->is_status_io_timeout &&
+ server->ops->is_status_io_timeout(buf)) {
+ num_io_timeout++;
+ if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) {
+ cifs_reconnect(server);
+ num_io_timeout = 0;
+ continue;
+ }
+ }
+
server->lstrp = jiffies;
for (i = 0; i < num_mids; i++) {
@@ -1359,177 +1313,6 @@ static int get_option_gid(substring_t args[], kgid_t *result)
return 0;
}
-static int cifs_parse_security_flavors(char *value,
- struct smb_vol *vol)
-{
-
- substring_t args[MAX_OPT_ARGS];
-
- /*
- * With mount options, the last one should win. Reset any existing
- * settings back to default.
- */
- vol->sectype = Unspecified;
- vol->sign = false;
-
- switch (match_token(value, cifs_secflavor_tokens, args)) {
- case Opt_sec_krb5p:
- cifs_dbg(VFS, "sec=krb5p is not supported!\n");
- return 1;
- case Opt_sec_krb5i:
- vol->sign = true;
- fallthrough;
- case Opt_sec_krb5:
- vol->sectype = Kerberos;
- break;
- case Opt_sec_ntlmsspi:
- vol->sign = true;
- fallthrough;
- case Opt_sec_ntlmssp:
- vol->sectype = RawNTLMSSP;
- break;
- case Opt_sec_ntlmi:
- vol->sign = true;
- fallthrough;
- case Opt_ntlm:
- vol->sectype = NTLM;
- break;
- case Opt_sec_ntlmv2i:
- vol->sign = true;
- fallthrough;
- case Opt_sec_ntlmv2:
- vol->sectype = NTLMv2;
- break;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- case Opt_sec_lanman:
- vol->sectype = LANMAN;
- break;
-#endif
- case Opt_sec_none:
- vol->nullauth = 1;
- break;
- default:
- cifs_dbg(VFS, "bad security option: %s\n", value);
- return 1;
- }
-
- return 0;
-}
-
-static int
-cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
-{
- substring_t args[MAX_OPT_ARGS];
-
- switch (match_token(value, cifs_cacheflavor_tokens, args)) {
- case Opt_cache_loose:
- vol->direct_io = false;
- vol->strict_io = false;
- vol->cache_ro = false;
- vol->cache_rw = false;
- break;
- case Opt_cache_strict:
- vol->direct_io = false;
- vol->strict_io = true;
- vol->cache_ro = false;
- vol->cache_rw = false;
- break;
- case Opt_cache_none:
- vol->direct_io = true;
- vol->strict_io = false;
- vol->cache_ro = false;
- vol->cache_rw = false;
- break;
- case Opt_cache_ro:
- vol->direct_io = false;
- vol->strict_io = false;
- vol->cache_ro = true;
- vol->cache_rw = false;
- break;
- case Opt_cache_rw:
- vol->direct_io = false;
- vol->strict_io = false;
- vol->cache_ro = false;
- vol->cache_rw = true;
- break;
- default:
- cifs_dbg(VFS, "bad cache= option: %s\n", value);
- return 1;
- }
- return 0;
-}
-
-static int
-cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3)
-{
- substring_t args[MAX_OPT_ARGS];
-
- switch (match_token(value, cifs_smb_version_tokens, args)) {
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case Smb_1:
- if (disable_legacy_dialects) {
- cifs_dbg(VFS, "mount with legacy dialect disabled\n");
- return 1;
- }
- if (is_smb3) {
- cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
- return 1;
- }
- cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n");
- vol->ops = &smb1_operations;
- vol->vals = &smb1_values;
- break;
- case Smb_20:
- if (disable_legacy_dialects) {
- cifs_dbg(VFS, "mount with legacy dialect disabled\n");
- return 1;
- }
- if (is_smb3) {
- cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n");
- return 1;
- }
- vol->ops = &smb20_operations;
- vol->vals = &smb20_values;
- break;
-#else
- case Smb_1:
- cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n");
- return 1;
- case Smb_20:
- cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n");
- return 1;
-#endif /* CIFS_ALLOW_INSECURE_LEGACY */
- case Smb_21:
- vol->ops = &smb21_operations;
- vol->vals = &smb21_values;
- break;
- case Smb_30:
- vol->ops = &smb30_operations;
- vol->vals = &smb30_values;
- break;
- case Smb_302:
- vol->ops = &smb30_operations; /* currently identical with 3.0 */
- vol->vals = &smb302_values;
- break;
- case Smb_311:
- vol->ops = &smb311_operations;
- vol->vals = &smb311_values;
- break;
- case Smb_3any:
- vol->ops = &smb30_operations; /* currently identical with 3.0 */
- vol->vals = &smb3any_values;
- break;
- case Smb_default:
- vol->ops = &smb30_operations; /* currently identical with 3.0 */
- vol->vals = &smbdefault_values;
- break;
- default:
- cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
- return 1;
- }
- return 0;
-}
-
/*
* Parse a devname into substrings and populate the vol->UNC and vol->prepath
* fields with the result. Returns 0 on success and an error otherwise.
@@ -3595,7 +3378,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
*/
tcon->retry = volume_info->retry;
tcon->nocase = volume_info->nocase;
- tcon->nohandlecache = volume_info->nohandlecache;
+ if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
+ tcon->nohandlecache = volume_info->nohandlecache;
+ else
+ tcon->nohandlecache = 1;
tcon->nodelete = volume_info->nodelete;
tcon->local_lease = volume_info->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -3889,13 +3675,21 @@ generic_ip_connect(struct TCP_Server_Info *server)
saddr = (struct sockaddr *) &server->dstaddr;
if (server->dstaddr.ss_family == AF_INET6) {
- sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
+ struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&server->dstaddr;
+
+ sport = ipv6->sin6_port;
slen = sizeof(struct sockaddr_in6);
sfamily = AF_INET6;
+ cifs_dbg(FYI, "%s: connecting to [%pI6]:%d\n", __func__, &ipv6->sin6_addr,
+ ntohs(sport));
} else {
- sport = ((struct sockaddr_in *) saddr)->sin_port;
+ struct sockaddr_in *ipv4 = (struct sockaddr_in *)&server->dstaddr;
+
+ sport = ipv4->sin_port;
slen = sizeof(struct sockaddr_in);
sfamily = AF_INET;
+ cifs_dbg(FYI, "%s: connecting to %pI4:%d\n", __func__, &ipv4->sin_addr,
+ ntohs(sport));
}
if (socket == NULL) {
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
new file mode 100644
index 000000000000..ad6c2fed4055
--- /dev/null
+++ b/fs/cifs/fs_context.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ * David Howells <dhowells@redhat.com>
+ */
+
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "fs_context.h"
+
+static const match_table_t cifs_smb_version_tokens = {
+ { Smb_1, SMB1_VERSION_STRING },
+ { Smb_20, SMB20_VERSION_STRING},
+ { Smb_21, SMB21_VERSION_STRING },
+ { Smb_30, SMB30_VERSION_STRING },
+ { Smb_302, SMB302_VERSION_STRING },
+ { Smb_302, ALT_SMB302_VERSION_STRING },
+ { Smb_311, SMB311_VERSION_STRING },
+ { Smb_311, ALT_SMB311_VERSION_STRING },
+ { Smb_3any, SMB3ANY_VERSION_STRING },
+ { Smb_default, SMBDEFAULT_VERSION_STRING },
+ { Smb_version_err, NULL }
+};
+
+int
+cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_smb_version_tokens, args)) {
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ case Smb_1:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
+ if (is_smb3) {
+ cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
+ return 1;
+ }
+ cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n");
+ vol->ops = &smb1_operations;
+ vol->vals = &smb1_values;
+ break;
+ case Smb_20:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
+ if (is_smb3) {
+ cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n");
+ return 1;
+ }
+ vol->ops = &smb20_operations;
+ vol->vals = &smb20_values;
+ break;
+#else
+ case Smb_1:
+ cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n");
+ return 1;
+ case Smb_20:
+ cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n");
+ return 1;
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
+ case Smb_21:
+ vol->ops = &smb21_operations;
+ vol->vals = &smb21_values;
+ break;
+ case Smb_30:
+ vol->ops = &smb30_operations;
+ vol->vals = &smb30_values;
+ break;
+ case Smb_302:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smb302_values;
+ break;
+ case Smb_311:
+ vol->ops = &smb311_operations;
+ vol->vals = &smb311_values;
+ break;
+ case Smb_3any:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smb3any_values;
+ break;
+ case Smb_default:
+ vol->ops = &smb30_operations; /* currently identical with 3.0 */
+ vol->vals = &smbdefault_values;
+ break;
+ default:
+ cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
+
+static const match_table_t cifs_secflavor_tokens = {
+ { Opt_sec_krb5, "krb5" },
+ { Opt_sec_krb5i, "krb5i" },
+ { Opt_sec_krb5p, "krb5p" },
+ { Opt_sec_ntlmsspi, "ntlmsspi" },
+ { Opt_sec_ntlmssp, "ntlmssp" },
+ { Opt_ntlm, "ntlm" },
+ { Opt_sec_ntlmi, "ntlmi" },
+ { Opt_sec_ntlmv2, "nontlm" },
+ { Opt_sec_ntlmv2, "ntlmv2" },
+ { Opt_sec_ntlmv2i, "ntlmv2i" },
+ { Opt_sec_lanman, "lanman" },
+ { Opt_sec_none, "none" },
+
+ { Opt_sec_err, NULL }
+};
+
+int cifs_parse_security_flavors(char *value, struct smb_vol *vol)
+{
+
+ substring_t args[MAX_OPT_ARGS];
+
+ /*
+ * With mount options, the last one should win. Reset any existing
+ * settings back to default.
+ */
+ vol->sectype = Unspecified;
+ vol->sign = false;
+
+ switch (match_token(value, cifs_secflavor_tokens, args)) {
+ case Opt_sec_krb5p:
+ cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+ return 1;
+ case Opt_sec_krb5i:
+ vol->sign = true;
+ fallthrough;
+ case Opt_sec_krb5:
+ vol->sectype = Kerberos;
+ break;
+ case Opt_sec_ntlmsspi:
+ vol->sign = true;
+ fallthrough;
+ case Opt_sec_ntlmssp:
+ vol->sectype = RawNTLMSSP;
+ break;
+ case Opt_sec_ntlmi:
+ vol->sign = true;
+ fallthrough;
+ case Opt_ntlm:
+ vol->sectype = NTLM;
+ break;
+ case Opt_sec_ntlmv2i:
+ vol->sign = true;
+ fallthrough;
+ case Opt_sec_ntlmv2:
+ vol->sectype = NTLMv2;
+ break;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+ case Opt_sec_lanman:
+ vol->sectype = LANMAN;
+ break;
+#endif
+ case Opt_sec_none:
+ vol->nullauth = 1;
+ break;
+ default:
+ cifs_dbg(VFS, "bad security option: %s\n", value);
+ return 1;
+ }
+
+ return 0;
+}
+
+static const match_table_t cifs_cacheflavor_tokens = {
+ { Opt_cache_loose, "loose" },
+ { Opt_cache_strict, "strict" },
+ { Opt_cache_none, "none" },
+ { Opt_cache_ro, "ro" },
+ { Opt_cache_rw, "singleclient" },
+ { Opt_cache_err, NULL }
+};
+
+int
+cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_cacheflavor_tokens, args)) {
+ case Opt_cache_loose:
+ vol->direct_io = false;
+ vol->strict_io = false;
+ vol->cache_ro = false;
+ vol->cache_rw = false;
+ break;
+ case Opt_cache_strict:
+ vol->direct_io = false;
+ vol->strict_io = true;
+ vol->cache_ro = false;
+ vol->cache_rw = false;
+ break;
+ case Opt_cache_none:
+ vol->direct_io = true;
+ vol->strict_io = false;
+ vol->cache_ro = false;
+ vol->cache_rw = false;
+ break;
+ case Opt_cache_ro:
+ vol->direct_io = false;
+ vol->strict_io = false;
+ vol->cache_ro = true;
+ vol->cache_rw = false;
+ break;
+ case Opt_cache_rw:
+ vol->direct_io = false;
+ vol->strict_io = false;
+ vol->cache_ro = false;
+ vol->cache_rw = true;
+ break;
+ default:
+ cifs_dbg(VFS, "bad cache= option: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
new file mode 100644
index 000000000000..886208a1b0ef
--- /dev/null
+++ b/fs/cifs/fs_context.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ * David Howells <dhowells@redhat.com>
+ */
+
+#ifndef _FS_CONTEXT_H
+#define _FS_CONTEXT_H
+
+#include <linux/parser.h>
+#include "cifsglob.h"
+
+enum smb_version {
+ Smb_1 = 1,
+ Smb_20,
+ Smb_21,
+ Smb_30,
+ Smb_302,
+ Smb_311,
+ Smb_3any,
+ Smb_default,
+ Smb_version_err
+};
+
+int cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3);
+
+enum {
+ Opt_cache_loose,
+ Opt_cache_strict,
+ Opt_cache_none,
+ Opt_cache_ro,
+ Opt_cache_rw,
+ Opt_cache_err
+};
+
+int cifs_parse_cache_flavor(char *value, struct smb_vol *vol);
+
+enum cifs_sec_param {
+ Opt_sec_krb5,
+ Opt_sec_krb5i,
+ Opt_sec_krb5p,
+ Opt_sec_ntlmsspi,
+ Opt_sec_ntlmssp,
+ Opt_ntlm,
+ Opt_sec_ntlmi,
+ Opt_sec_ntlmv2,
+ Opt_sec_ntlmv2i,
+ Opt_sec_lanman,
+ Opt_sec_none,
+
+ Opt_sec_err
+};
+
+int cifs_parse_security_flavors(char *value, struct smb_vol *vol);
+
+#endif
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1f75b25e559a..9ee5f304592f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -656,7 +656,7 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
struct super_block *sb, bool adjust_tz,
- bool symlink)
+ bool symlink, u32 reparse_tag)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -684,8 +684,22 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
-
- if (symlink) {
+ if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) {
+ fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_LNK;
+ } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) {
+ fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_FIFO;
+ } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) {
+ fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_SOCK;
+ } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) {
+ fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_CHR;
+ } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) {
+ fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_BLK;
+ } else if (symlink) { /* TODO add more reparse tag checks */
fattr->cf_mode = S_IFLNK;
fattr->cf_dtype = DT_LNK;
} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
@@ -740,8 +754,9 @@ cifs_get_file_info(struct file *filp)
rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
switch (rc) {
case 0:
+ /* TODO: add support to query reparse tag */
cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
- false);
+ false, 0 /* no reparse tag */);
break;
case -EREMOTE:
cifs_create_dfs_fattr(&fattr, inode->i_sb);
@@ -910,12 +925,13 @@ cifs_get_inode_info(struct inode **inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
bool adjust_tz = false;
struct cifs_fattr fattr = {0};
- bool symlink = false;
+ bool is_reparse_point = false;
FILE_ALL_INFO *data = in_data;
FILE_ALL_INFO *tmp_data = NULL;
void *smb1_backup_rsp_buf = NULL;
int rc = 0;
int tmprc = 0;
+ __u32 reparse_tag = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -938,8 +954,8 @@ cifs_get_inode_info(struct inode **inode,
goto out;
}
rc = server->ops->query_path_info(xid, tcon, cifs_sb,
- full_path, tmp_data,
- &adjust_tz, &symlink);
+ full_path, tmp_data,
+ &adjust_tz, &is_reparse_point);
data = tmp_data;
}
@@ -949,7 +965,19 @@ cifs_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ /*
+ * If the file is a reparse point, it is more complicated
+ * since we have to check if its reparse tag matches a known
+ * special file type e.g. symlink or fifo or char etc.
+ */
+ if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) &&
+ server->ops->query_reparse_tag) {
+ rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb,
+ full_path, &reparse_tag);
+ cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag);
+ }
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
+ is_reparse_point, reparse_tag);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -2883,13 +2911,18 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
+ int rc, retries = 0;
- if (pTcon->unix_ext)
- return cifs_setattr_unix(direntry, attrs);
-
- return cifs_setattr_nounix(direntry, attrs);
+ do {
+ if (pTcon->unix_ext)
+ rc = cifs_setattr_unix(direntry, attrs);
+ else
+ rc = cifs_setattr_nounix(direntry, attrs);
+ retries++;
+ } while (is_retryable_error(rc) && retries < 2);
/* BB: add cifs_setattr_legacy for really old servers */
+ return rc;
}
#if 0
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6df0922e7e30..799be3a5d25e 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -168,10 +168,33 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
+ /*
+ * The IO_REPARSE_TAG_LX_ tags originally were used by WSL but they
+ * are preferred by the Linux client in some cases since, unlike
+ * the NFS reparse tag (or EAs), they don't require an extra query
+ * to determine which type of special file they represent.
+ * TODO: go through all documented reparse tags to see if we can
+ * reasonably map some of them to directories vs. files vs. symlinks
+ */
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
- } else {
+ } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) {
+ fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_LNK;
+ } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) {
+ fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_FIFO;
+ } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) {
+ fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_SOCK;
+ } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) {
+ fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_CHR;
+ } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) {
+ fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode;
+ fattr->cf_dtype = DT_BLK;
+ } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
}
@@ -267,9 +290,8 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
if (reparse_file_needs_reval(fattr))
fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
- /* TODO map SIDs */
- fattr->cf_uid = cifs_sb->mnt_uid;
- fattr->cf_gid = cifs_sb->mnt_gid;
+ sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP);
}
static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info)
@@ -360,11 +382,11 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
*/
static int
-initiate_cifs_search(const unsigned int xid, struct file *file)
+initiate_cifs_search(const unsigned int xid, struct file *file,
+ char *full_path)
{
__u16 search_flags;
int rc = 0;
- char *full_path = NULL;
struct cifsFileInfo *cifsFile;
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct tcon_link *tlink = NULL;
@@ -400,12 +422,6 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
cifsFile->invalidHandle = true;
cifsFile->srch_inf.endOfSearch = false;
- full_path = build_path_from_dentry(file_dentry(file));
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto error_exit;
- }
-
cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos);
ffirst_retry:
@@ -444,7 +460,6 @@ ffirst_retry:
goto ffirst_retry;
}
error_exit:
- kfree(full_path);
cifs_put_tlink(tlink);
return rc;
}
@@ -688,7 +703,8 @@ static int cifs_save_resume_key(const char *current_entry,
*/
static int
find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
- struct file *file, char **current_entry, int *num_to_ret)
+ struct file *file, char *full_path,
+ char **current_entry, int *num_to_ret)
{
__u16 search_flags;
int rc = 0;
@@ -741,7 +757,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
ntwrk_buf_start);
cfile->srch_inf.ntwrk_buf_start = NULL;
}
- rc = initiate_cifs_search(xid, file);
+ rc = initiate_cifs_search(xid, file, full_path);
if (rc) {
cifs_dbg(FYI, "error %d reinitiating a search on rewind\n",
rc);
@@ -925,15 +941,22 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
char *tmp_buf = NULL;
char *end_of_smb;
unsigned int max_len;
+ char *full_path = NULL;
xid = get_xid();
+ full_path = build_path_from_dentry(file_dentry(file));
+ if (full_path == NULL) {
+ rc = -ENOMEM;
+ goto rddir2_exit;
+ }
+
/*
* Ensure FindFirst doesn't fail before doing filldir() for '.' and
* '..'. Otherwise we won't be able to notify VFS in case of failure.
*/
if (file->private_data == NULL) {
- rc = initiate_cifs_search(xid, file);
+ rc = initiate_cifs_search(xid, file, full_path);
cifs_dbg(FYI, "initiate cifs search rc %d\n", rc);
if (rc)
goto rddir2_exit;
@@ -960,8 +983,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
} */
tcon = tlink_tcon(cifsFile->tlink);
- rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
- &num_to_fill);
+ rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path,
+ &current_entry, &num_to_fill);
if (rc) {
cifs_dbg(FYI, "fce error %d\n", rc);
goto rddir2_exit;
@@ -1019,6 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
kfree(tmp_buf);
rddir2_exit:
+ kfree(full_path);
free_xid(xid);
return rc;
}
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index cf20f0b5d836..99a1951a01ec 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -58,6 +58,7 @@
#define SMB2_HMACSHA256_SIZE (32)
#define SMB2_CMACAES_SIZE (16)
#define SMB3_SIGNKEY_SIZE (16)
+#define SMB3_GCM256_CRYPTKEY_SIZE (32)
/* Maximum buffer size value we can send with 1 credit */
#define SMB2_MAX_BUFFER_SIZE 65536
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index eba01d0908dd..1f900b81c34a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -506,17 +506,17 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
int
smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink)
+ FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse)
{
int rc;
struct smb2_file_all_info *smb2_data;
__u32 create_options = 0;
- struct cifs_fid fid;
bool no_cached_open = tcon->nohandlecache;
struct cifsFileInfo *cfile;
+ struct cached_fid *cfid = NULL;
*adjust_tz = false;
- *symlink = false;
+ *reparse = false;
smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
@@ -525,7 +525,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* If it is a root and its handle is cached then use it */
if (!strlen(full_path) && !no_cached_open) {
- rc = open_shroot(xid, tcon, cifs_sb, &fid);
+ rc = open_shroot(xid, tcon, cifs_sb, &cfid);
if (rc)
goto out;
@@ -533,12 +533,13 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
move_smb2_info_to_cifs(data,
&tcon->crfid.file_all_info);
} else {
- rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, smb2_data);
+ rc = SMB2_query_info(xid, tcon,
+ cfid->fid->persistent_fid,
+ cfid->fid->volatile_fid, smb2_data);
if (!rc)
move_smb2_info_to_cifs(data, smb2_data);
}
- close_shroot(&tcon->crfid);
+ close_shroot(cfid);
goto out;
}
@@ -547,7 +548,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
if (rc == -EOPNOTSUPP) {
- *symlink = true;
+ *reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
@@ -569,7 +570,7 @@ out:
int
smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- struct smb311_posix_qinfo *data, bool *adjust_tz, bool *symlink)
+ struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse)
{
int rc;
__u32 create_options = 0;
@@ -577,7 +578,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct smb311_posix_qinfo *smb2_data;
*adjust_tz = false;
- *symlink = false;
+ *reparse = false;
/* BB TODO: Make struct larger when add support for parsing owner SIDs */
smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo),
@@ -598,7 +599,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
if (rc == -EOPNOTSUPP) {
/* BB TODO: When support for special files added to Samba re-verify this path */
- *symlink = true;
+ *reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 7fde3775cb57..c775682ee973 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -488,7 +488,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"},
{STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"},
{STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"},
- {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"},
+ {STATUS_IO_TIMEOUT, -EAGAIN, "STATUS_IO_TIMEOUT"},
{STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"},
{STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"},
{STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"},
@@ -814,7 +814,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"},
{STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO,
"STATUS_DOMAIN_CONTROLLER_NOT_FOUND"},
- {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"},
+ {STATUS_ACCOUNT_LOCKED_OUT, -EACCES, "STATUS_ACCOUNT_LOCKED_OUT"},
{STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"},
{STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"},
{STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"},
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index d44df8f95bcd..504766cb6c19 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -72,7 +72,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
/* eg found case where write overlapping reconnect messed up credits */
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
- server->hostname, *val);
+ server->hostname, *val, add);
if ((instance == 0) || (instance == server->reconnect_instance))
*val += add;
else
@@ -121,6 +121,8 @@ smb2_add_credits(struct TCP_Server_Info *server,
cifs_dbg(FYI, "disabling oplocks\n");
break;
default:
+ trace_smb3_add_credits(server->CurrentMid,
+ server->hostname, rc, add);
cifs_dbg(FYI, "add %u credits total=%d\n", add, rc);
}
}
@@ -651,7 +653,8 @@ smb2_cached_lease_break(struct work_struct *work)
* Open the directory at the root of a share
*/
int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid)
+ struct cifs_sb_info *cifs_sb,
+ struct cached_fid **cfid)
{
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = ses->server;
@@ -666,11 +669,12 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
int rc, flags = 0;
__le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
+ struct cifs_fid *pfid;
mutex_lock(&tcon->crfid.fid_mutex);
if (tcon->crfid.is_valid) {
cifs_dbg(FYI, "found a cached root file handle\n");
- memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid));
+ *cfid = &tcon->crfid;
kref_get(&tcon->crfid.refcount);
mutex_unlock(&tcon->crfid.fid_mutex);
return 0;
@@ -691,6 +695,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
if (!server->ops->new_lease_key)
return -EIO;
+ pfid = tcon->crfid.fid;
server->ops->new_lease_key(pfid);
memset(rqst, 0, sizeof(rqst));
@@ -820,6 +825,8 @@ oshr_free:
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ if (rc == 0)
+ *cfid = &tcon->crfid;
return rc;
}
@@ -833,6 +840,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
bool no_cached_open = tcon->nohandlecache;
+ struct cached_fid *cfid = NULL;
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -841,12 +849,14 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- if (no_cached_open)
+ if (no_cached_open) {
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
- else
- rc = open_shroot(xid, tcon, cifs_sb, &fid);
-
+ } else {
+ rc = open_shroot(xid, tcon, cifs_sb, &cfid);
+ if (rc == 0)
+ memcpy(&fid, cfid->fid, sizeof(struct cifs_fid));
+ }
if (rc)
return;
@@ -863,7 +873,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
if (no_cached_open)
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
else
- close_shroot(&tcon->crfid);
+ close_shroot(cfid);
}
static void
@@ -2346,6 +2356,17 @@ smb2_is_session_expired(char *buf)
return true;
}
+static bool
+smb2_is_status_io_timeout(char *buf)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+
+ if (shdr->Status == STATUS_IO_TIMEOUT)
+ return true;
+ else
+ return false;
+}
+
static int
smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
struct cifsInodeInfo *cinode)
@@ -3013,6 +3034,133 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+int
+smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 *tag)
+{
+ int rc;
+ __le16 *utf16_path = NULL;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+ struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+ int flags = 0;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec close_iov[1];
+ struct smb2_ioctl_rsp *ioctl_rsp;
+ struct reparse_data_buffer *reparse_buf;
+ u32 plen;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ /*
+ * setup smb2open - TODO add optimization to call cifs_get_readable_path
+ * to see if there is a handle already open that we can use
+ */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ memset(&oparms, 0, sizeof(oparms));
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT);
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto query_rp_exit;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+
+ /* IOCTL */
+ memset(&io_iov, 0, sizeof(io_iov));
+ rqst[1].rq_iov = io_iov;
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, server,
+ &rqst[1], fid.persistent_fid,
+ fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
+ true /* is_fctl */, NULL, 0,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ if (rc)
+ goto query_rp_exit;
+
+ smb2_set_next_command(tcon, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+
+
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[2].rq_iov = close_iov;
+ rqst[2].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, server,
+ &rqst[2], COMPOUND_FID, COMPOUND_FID, false);
+ if (rc)
+ goto query_rp_exit;
+
+ smb2_set_related(&rqst[2]);
+
+ rc = compound_send_recv(xid, tcon->ses, server,
+ flags, 3, rqst,
+ resp_buftype, rsp_iov);
+
+ ioctl_rsp = rsp_iov[1].iov_base;
+
+ /*
+ * Open was successful and we got an ioctl response.
+ */
+ if (rc == 0) {
+ /* See MS-FSCC 2.3.23 */
+
+ reparse_buf = (struct reparse_data_buffer *)
+ ((char *)ioctl_rsp +
+ le32_to_cpu(ioctl_rsp->OutputOffset));
+ plen = le32_to_cpu(ioctl_rsp->OutputCount);
+
+ if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
+ rsp_iov[1].iov_len) {
+ cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n",
+ plen);
+ rc = -EIO;
+ goto query_rp_exit;
+ }
+ *tag = le32_to_cpu(reparse_buf->ReparseTag);
+ }
+
+ query_rp_exit:
+ kfree(utf16_path);
+ SMB2_open_free(&rqst[0]);
+ SMB2_ioctl_free(&rqst[1]);
+ SMB2_close_free(&rqst[2]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ return rc;
+}
+
static struct cifs_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen)
@@ -3072,7 +3220,12 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.tcon = tcon;
oparms.desired_access = READ_CONTROL;
oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
+ /*
+ * When querying an ACL, even if the file is a symlink we want to open
+ * the source not the target, and so the protocol requires that the
+ * client specify this flag when opening a reparse point
+ */
+ oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -3801,10 +3954,11 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
tr_hdr->Flags = cpu_to_le16(0x01);
- if (cipher_type == SMB2_ENCRYPTION_AES128_GCM)
- get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+ if ((cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (cipher_type == SMB2_ENCRYPTION_AES256_GCM))
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
else
- get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
@@ -3924,7 +4078,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (rc) {
cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
enc ? "en" : "de");
- return 0;
+ return rc;
}
rc = smb3_crypto_aead_allocate(server);
@@ -3935,7 +4089,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
tfm = enc ? server->secmech.ccmaesencrypt :
server->secmech.ccmaesdecrypt;
- rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
+
+ if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
+ else
+ rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
+
if (rc) {
cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
return rc;
@@ -3973,11 +4132,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
goto free_sg;
}
- if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
- memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
+ memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
else {
iv[0] = 3;
- memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
+ memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
}
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
@@ -4103,7 +4263,8 @@ smb3_is_transform_hdr(void *buf)
static int
decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
unsigned int buf_data_size, struct page **pages,
- unsigned int npages, unsigned int page_data_size)
+ unsigned int npages, unsigned int page_data_size,
+ bool is_offloaded)
{
struct kvec iov[2];
struct smb_rqst rqst = {NULL};
@@ -4129,7 +4290,8 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
memmove(buf, iov[1].iov_base, buf_data_size);
- server->total_read = buf_data_size + page_data_size;
+ if (!is_offloaded)
+ server->total_read = buf_data_size + page_data_size;
return rc;
}
@@ -4342,7 +4504,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
struct mid_q_entry *mid;
rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size,
- dw->ppages, dw->npages, dw->len);
+ dw->ppages, dw->npages, dw->len, true);
if (rc) {
cifs_dbg(VFS, "error decrypting rc=%d\n", rc);
goto free_pages;
@@ -4448,7 +4610,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
non_offloaded_decrypt:
rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
- pages, npages, len);
+ pages, npages, len, false);
if (rc)
goto free_pages;
@@ -4504,7 +4666,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
server->total_read += length;
buf_size = pdu_length - sizeof(struct smb2_transform_hdr);
- length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
+ length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false);
if (length)
return length;
@@ -4809,6 +4971,7 @@ struct smb_version_operations smb20_operations = {
.make_node = smb2_make_node,
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
+ .is_status_io_timeout = smb2_is_status_io_timeout,
};
struct smb_version_operations smb21_operations = {
@@ -4909,6 +5072,7 @@ struct smb_version_operations smb21_operations = {
.make_node = smb2_make_node,
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
+ .is_status_io_timeout = smb2_is_status_io_timeout,
};
struct smb_version_operations smb30_operations = {
@@ -4949,6 +5113,8 @@ struct smb_version_operations smb30_operations = {
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
+ /* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */
+ .query_reparse_tag = smb2_query_reparse_tag,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
@@ -5019,6 +5185,7 @@ struct smb_version_operations smb30_operations = {
.make_node = smb2_make_node,
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
+ .is_status_io_timeout = smb2_is_status_io_timeout,
};
struct smb_version_operations smb311_operations = {
@@ -5059,6 +5226,7 @@ struct smb_version_operations smb311_operations = {
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
+ .query_reparse_tag = smb2_query_reparse_tag,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
@@ -5130,6 +5298,7 @@ struct smb_version_operations smb311_operations = {
.make_node = smb2_make_node,
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
+ .is_status_io_timeout = smb2_is_status_io_timeout,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 96c172d94fba..445e80862865 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -449,10 +449,22 @@ static void
build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
- pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */
- pneg_ctxt->CipherCount = cpu_to_le16(2);
- pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
- pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+ if (require_gcm_256) {
+ pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + 1 cipher */
+ pneg_ctxt->CipherCount = cpu_to_le16(1);
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES256_GCM;
+ } else if (enable_gcm_256) {
+ pneg_ctxt->DataLength = cpu_to_le16(8); /* Cipher Count + 3 ciphers */
+ pneg_ctxt->CipherCount = cpu_to_le16(3);
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+ pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES256_GCM;
+ pneg_ctxt->Ciphers[2] = SMB2_ENCRYPTION_AES128_CCM;
+ } else {
+ pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + 2 ciphers */
+ pneg_ctxt->CipherCount = cpu_to_le16(2);
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+ pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+ }
}
static unsigned int
@@ -598,8 +610,29 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server,
return -EINVAL;
}
cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0]));
- if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) &&
- (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) {
+ if (require_gcm_256) {
+ if (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM) {
+ cifs_dbg(VFS, "Server does not support requested encryption type (AES256 GCM)\n");
+ return -EOPNOTSUPP;
+ }
+ } else if (ctxt->Ciphers[0] == 0) {
+ /*
+ * e.g. if server only supported AES256_CCM (very unlikely)
+ * or server supported no encryption types or had all disabled.
+ * Since GLOBAL_CAP_ENCRYPTION will be not set, in the case
+ * in which mount requested encryption ("seal") checks later
+ * on during tree connection will return proper rc, but if
+ * seal not requested by client, since server is allowed to
+ * return 0 to indicate no supported cipher, we can't fail here
+ */
+ server->cipher_type = 0;
+ server->capabilities &= ~SMB2_GLOBAL_CAP_ENCRYPTION;
+ pr_warn_once("Server does not support requested encryption types\n");
+ return 0;
+ } else if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) &&
+ (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM) &&
+ (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM)) {
+ /* server returned a cipher we didn't ask for */
pr_warn_once("Invalid SMB3.11 cipher returned\n");
return -EINVAL;
}
@@ -1948,9 +1981,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
unsigned int next;
unsigned int remaining;
char *name;
- const char smb3_create_tag_posix[] = {0x93, 0xAD, 0x25, 0x50, 0x9C,
- 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83,
- 0xDE, 0x96, 0x8B, 0xCD, 0x7C};
+ static const char smb3_create_tag_posix[] = {
+ 0x93, 0xAD, 0x25, 0x50, 0x9C,
+ 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83,
+ 0xDE, 0x96, 0x8B, 0xCD, 0x7C
+ };
*oplock = 0;
data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index c3f1baf5bde2..f05f9b12f689 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -128,8 +128,8 @@ struct smb2_sync_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
-#define SMB3_AES128CCM_NONCE 11
-#define SMB3_AES128GCM_NONCE 12
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
/* Transform flags (for 3.0 dialect this flag indicates CCM */
#define TRANSFORM_FLAG_ENCRYPTED 0x0001
@@ -153,10 +153,14 @@ struct smb2_compression_transform_hdr {
} __packed;
/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE 0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
+
struct compression_payload_header {
- __le16 AlgorithmId;
- __le16 Reserved;
- __le32 Length;
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le32 Length; /* length of compressed playload including field below if present */
+ /* __le32 OriginalPayloadSize; */ /* optional */
} __packed;
/* See MS-SMB2 2.2.42.2 */
@@ -167,6 +171,26 @@ struct compression_pattern_payload_v1 {
__le32 Repetitions;
} __packed;
+/* See MS-SMB2 2.2.43 */
+struct smb2_rdma_transform {
+ __le16 RdmaDescriptorOffset;
+ __le16 RdmaDescriptorLength;
+ __le32 Channel; /* for values see channel description in smb2 read above */
+ __le16 TransformCount;
+ __le16 Reserved1;
+ __le32 Reserved2;
+} __packed;
+
+struct smb2_rdma_encryption_transform {
+ __le16 TransformType;
+ __le16 SignatureLength;
+ __le16 NonceLength;
+ __u16 Reserved;
+ __u8 Signature[]; /* variable length */
+ /* u8 Nonce[] */
+ /* followed by padding */
+} __packed;
+
/*
* SMB2 flag definitions
*/
@@ -297,6 +321,9 @@ struct smb2_negotiate_req {
#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
+#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
+#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
+#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
struct smb2_neg_context {
@@ -325,6 +352,9 @@ struct smb2_preauth_neg_context {
/* Encryption Algorithms Ciphers */
#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+/* we currently do not request AES256_CCM since presumably GCM faster */
+#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
#define MIN_ENCRYPT_CTXT_DATA_LEN 4
@@ -332,8 +362,9 @@ struct smb2_encryption_neg_context {
__le16 ContextType; /* 2 */
__le16 DataLength;
__le32 Reserved;
- __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */
- __le16 Ciphers[2];
+ /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
+ __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
+ __le16 Ciphers[3];
} __packed;
/* See MS-SMB2 2.2.3.1.3 */
@@ -351,10 +382,10 @@ struct smb2_encryption_neg_context {
struct smb2_compression_capabilities_context {
__le16 ContextType; /* 3 */
__le16 DataLength;
- __u32 Flags;
+ __u32 Reserved;
__le16 CompressionAlgorithmCount;
__u16 Padding;
- __u32 Reserved1;
+ __u32 Flags;
__le16 CompressionAlgorithms[3];
} __packed;
@@ -363,12 +394,44 @@ struct smb2_compression_capabilities_context {
* Its struct simply contains NetName, an array of Unicode characters
*/
struct smb2_netname_neg_context {
- __le16 ContextType; /* 0x100 */
+ __le16 ContextType; /* 5 */
__le16 DataLength;
__le32 Reserved;
__le16 NetName[]; /* hostname of target converted to UCS-2 */
} __packed;
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.5
+ */
+
+/* RDMA Transform IDs */
+#define SMB2_RDMA_TRANSFORM_NONE 0x0000
+#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+
+struct smb2_rdma_transform_capabilities_context {
+ __le16 ContextType; /* 7 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le16 TransformCount;
+ __u16 Reserved1;
+ __u32 Reserved2;
+ __le16 RDMATransformIds[1];
+} __packed;
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256 0
+#define SIGNING_ALG_AES_CMAC 1
+#define SIGNING_ALG_AES_GMAC 2
+
+struct smb2_signing_capabilities {
+ __le16 ContextType; /* 8 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le16 SigningAlgorithmCount;
+ __le16 SigningAlgorithms[];
+} __packed;
+
#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
__le16 ContextType; /* 0x100 */
@@ -936,6 +999,31 @@ struct copychunk_ioctl_rsp {
__le32 TotalBytesWritten;
} __packed;
+/* See MS-FSCC 2.3.29 and 2.3.30 */
+struct get_retrieval_pointer_count_req {
+ __le64 StartingVcn; /* virtual cluster number (signed) */
+} __packed;
+
+struct get_retrieval_pointer_count_rsp {
+ __le32 ExtentCount;
+} __packed;
+
+/*
+ * See MS-FSCC 2.3.33 and 2.3.34
+ * request is the same as get_retrieval_point_count_req struct above
+ */
+struct smb3_extents {
+ __le64 NextVcn;
+ __le64 Lcn; /* logical cluster number */
+} __packed;
+
+struct get_retrieval_pointers_refcount_rsp {
+ __le32 ExtentCount;
+ __u32 Reserved;
+ __le64 StartingVcn;
+ struct smb3_extents extents[];
+} __packed;
+
struct fsctl_set_integrity_information_req {
__le16 ChecksumAlgorithm;
__le16 Reserved;
@@ -1178,6 +1266,7 @@ struct smb2_flush_rsp {
#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */
#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
+#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */
/* SMB2 read request without RFC1001 length at the beginning */
struct smb2_read_plain_req {
@@ -1197,6 +1286,10 @@ struct smb2_read_plain_req {
__u8 Buffer[1];
} __packed;
+/* Read flags */
+#define SMB2_READFLAG_RESPONSE_NONE 0x00000000
+#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001
+
struct smb2_read_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 17 */
@@ -1204,7 +1297,7 @@ struct smb2_read_rsp {
__u8 Reserved;
__le32 DataLength;
__le32 DataRemaining;
- __u32 Reserved2;
+ __u32 Flags;
__u8 Buffer[1];
} __packed;
@@ -1572,6 +1665,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
char FileName[]; /* New name to be assigned */
+ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */
} __packed; /* level 10 Set */
struct smb2_file_link_info { /* encoding of request for level 11 */
@@ -1623,6 +1717,11 @@ struct smb2_file_eof_info { /* encoding of request for level 10 */
__le64 EndOfFile; /* new end of file value */
} __packed; /* level 20 Set */
+struct smb2_file_reparse_point_info {
+ __le64 IndexNumber;
+ __le32 Tag;
+} __packed;
+
struct smb2_file_network_open_info {
__le64 CreationTime;
__le64 LastAccessTime;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2f8ecbf54214..d4110447ee3a 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -70,12 +70,16 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid);
+ struct cifs_sb_info *cifs_sb,
+ struct cached_fid **cfid);
extern void close_shroot(struct cached_fid *cfid);
extern void close_shroot_lease(struct cached_fid *cfid);
extern void close_shroot_lease_locked(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
+extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *path,
+ __u32 *reparse_tag);
extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path, FILE_ALL_INFO *data,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index c0348e3b1695..ebccd71cc60a 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -849,12 +849,13 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
struct crypto_aead *tfm;
if (!server->secmech.ccmaesencrypt) {
- if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
else
tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
if (IS_ERR(tfm)) {
- cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n",
+ cifs_server_dbg(VFS, "%s: Failed alloc encrypt aead\n",
__func__);
return PTR_ERR(tfm);
}
@@ -862,7 +863,8 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
}
if (!server->secmech.ccmaesdecrypt) {
- if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
else
tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 1ff28529cf4b..a0e84747f567 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -103,6 +103,8 @@
#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
+#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
+#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index eef4b08c7208..90e0fab69bb8 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -878,33 +878,39 @@ DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect);
DECLARE_EVENT_CLASS(smb3_credit_class,
TP_PROTO(__u64 currmid,
char *hostname,
- int credits),
- TP_ARGS(currmid, hostname, credits),
+ int credits,
+ int credits_to_add),
+ TP_ARGS(currmid, hostname, credits, credits_to_add),
TP_STRUCT__entry(
__field(__u64, currmid)
__field(char *, hostname)
__field(int, credits)
+ __field(int, credits_to_add)
),
TP_fast_assign(
__entry->currmid = currmid;
__entry->hostname = hostname;
__entry->credits = credits;
+ __entry->credits_to_add = credits_to_add;
),
- TP_printk("server=%s current_mid=0x%llx credits=%d",
+ TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d",
__entry->hostname,
__entry->currmid,
- __entry->credits)
+ __entry->credits,
+ __entry->credits_to_add)
)
#define DEFINE_SMB3_CREDIT_EVENT(name) \
DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_PROTO(__u64 currmid, \
char *hostname, \
- int credits), \
- TP_ARGS(currmid, hostname, credits))
+ int credits, \
+ int credits_to_add), \
+ TP_ARGS(currmid, hostname, credits, credits_to_add))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
+DEFINE_SMB3_CREDIT_EVENT(add_credits);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ac7632482736..e27e255d40dd 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -563,7 +563,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
cifs_num_waiters_dec(server);
if (!rc) {
trace_smb3_credit_timeout(server->CurrentMid,
- server->hostname, num_credits);
+ server->hostname, num_credits, 0);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
timeout);
return -ENOTSUPP;
@@ -604,7 +604,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
if (!rc) {
trace_smb3_credit_timeout(
server->CurrentMid,
- server->hostname, num_credits);
+ server->hostname, num_credits,
+ 0);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
timeout);
return -ENOTSUPP;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 912308600d39..4b90cfd1ec36 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -690,8 +690,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = 0;
buf->f_files = CRAMFS_SB(sb)->files;
buf->f_ffree = 0;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = CRAMFS_MAXPATHLEN;
return 0;
}
diff --git a/fs/dax.c b/fs/dax.c
index 6ad346352a8c..5b47834f2e1b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -559,8 +559,11 @@ fallback:
}
/**
- * dax_layout_busy_page - find first pinned page in @mapping
+ * dax_layout_busy_page_range - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ * pages from 'start' till the end of file are included.
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
@@ -573,12 +576,15 @@ fallback:
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
+ loff_t start, loff_t end)
{
- XA_STATE(xas, &mapping->i_pages, 0);
void *entry;
unsigned int scanned = 0;
struct page *page = NULL;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
+ /* If end == LLONG_MAX, all pages from start to till end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the iteration and wait, or
@@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
- * holding those locks, and unmap_mapping_range() will not zero the
+ * holding those locks, and unmap_mapping_pages() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
- unmap_mapping_range(mapping, 0, 0, 0);
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
xas_lock_irq(&xas);
- xas_for_each(&xas, entry, ULONG_MAX) {
+ xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
if (unlikely(dax_is_locked(entry)))
@@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
xas_unlock_irq(&xas);
return page;
}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+ return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
+}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index a4a945d0ac6a..62b155b9366b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -342,8 +342,7 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
sbi->inode_blocks *
(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
buf->f_ffree = sbi->inode_free; /* free inodes */
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
return 0;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b9a09806512a..be10b16ea66e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -561,8 +561,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = EROFS_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 573659bfbc55..916797077aad 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -59,9 +59,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
}
/* read a directory entry from the opened directory */
-static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
+static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, dentries_per_clu_bits = 0;
+ int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
unsigned int type, clu_offset;
sector_t sector;
struct exfat_chain dir, clu;
@@ -70,7 +70,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- unsigned int dentry = ei->rwoffset & 0xFFFFFFFF;
+ unsigned int dentry = EXFAT_B_TO_DEN(*cpos) & 0xFFFFFFFF;
struct buffer_head *bh;
/* check if the given file ID is opened */
@@ -127,6 +127,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
continue;
}
+ num_ext = ep->dentry.file.num_ext;
dir_entry->attr = le16_to_cpu(ep->dentry.file.attr);
exfat_get_entry_time(sbi, &dir_entry->crtime,
ep->dentry.file.create_tz,
@@ -157,12 +158,13 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
return -EIO;
dir_entry->size =
le64_to_cpu(ep->dentry.stream.valid_size);
+ dir_entry->entry = dentry;
brelse(bh);
ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
ei->hint_bmap.clu = clu.dir;
- ei->rwoffset = ++dentry;
+ *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext);
return 0;
}
@@ -178,7 +180,7 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry)
}
dir_entry->namebuf.lfn[0] = '\0';
- ei->rwoffset = dentry;
+ *cpos = EXFAT_DEN_TO_B(dentry);
return 0;
}
@@ -242,12 +244,10 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
if (err)
goto unlock;
get_new:
- ei->rwoffset = EXFAT_B_TO_DEN(cpos);
-
if (cpos >= i_size_read(inode))
goto end_of_dir;
- err = exfat_readdir(inode, &de);
+ err = exfat_readdir(inode, &cpos, &de);
if (err) {
/*
* At least we tried to read a sector. Move cpos to next sector
@@ -262,13 +262,10 @@ get_new:
goto end_of_dir;
}
- cpos = EXFAT_DEN_TO_B(ei->rwoffset);
-
if (!nb->lfn[0])
goto end_of_dir;
- i_pos = ((loff_t)ei->start_clu << 32) |
- ((ei->rwoffset - 1) & 0xffffffff);
+ i_pos = ((loff_t)ei->start_clu << 32) | (de.entry & 0xffffffff);
tmp = exfat_iget(sb, i_pos);
if (tmp) {
inum = tmp->i_ino;
@@ -911,7 +908,6 @@ enum {
/*
* return values:
* >= 0 : return dir entiry position with the name in dir
- * -EEXIST : (root dir, ".") it is the root dir itself
* -ENOENT : entry with the name does not exist
* -EIO : I/O error
*/
@@ -979,11 +975,8 @@ rewind:
if (ei->hint_femp.eidx ==
EXFAT_HINT_NONE ||
candi_empty.eidx <=
- ei->hint_femp.eidx) {
- memcpy(&ei->hint_femp,
- &candi_empty,
- sizeof(candi_empty));
- }
+ ei->hint_femp.eidx)
+ ei->hint_femp = candi_empty;
}
brelse(bh);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index c013fe931d9c..b8f0e829ecbd 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -128,7 +128,7 @@ enum {
struct exfat_dentry_namebuf {
char *lfn;
- int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */
+ int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
};
/* unicode name structure */
@@ -265,8 +265,6 @@ struct exfat_inode_info {
* the validation of hint_stat.
*/
unsigned int version;
- /* file offset or dentry index for readdir */
- loff_t rwoffset;
/* hint for cluster last accessed */
struct exfat_hint hint_bmap;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index f41f523a58ad..a92478eabfa4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -208,8 +208,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
/* hint information */
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->hint_bmap.clu = EXFAT_EOF_CLUSTER;
- if (ei->rwoffset > new_size)
- ei->rwoffset = new_size;
/* hint_stat will be used if this is directory. */
ei->hint_stat.eidx = 0;
@@ -229,7 +227,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- unsigned int blocksize = 1 << inode->i_blkbits;
+ unsigned int blocksize = i_blocksize(inode);
loff_t aligned_size;
int err;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index a6de17cac3df..730373e0965a 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -114,8 +114,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
unsigned int local_clu_offset = clu_offset;
unsigned int num_to_be_allocated = 0, num_clusters = 0;
- ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi);
-
if (EXFAT_I(inode)->i_size_ondisk > 0)
num_clusters =
EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk,
@@ -556,7 +554,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
struct exfat_inode_info *ei = EXFAT_I(inode);
loff_t size = info->size;
- memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain));
+ ei->dir = info->dir;
ei->entry = info->entry;
ei->attr = info->attr;
ei->start_clu = info->start_clu;
@@ -567,7 +565,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->hint_stat.eidx = 0;
ei->hint_stat.clu = info->start_clu;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- ei->rwoffset = 0;
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->i_pos = 0;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index c94ac239f740..2932b23a3b6c 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -290,7 +290,7 @@ static int exfat_check_max_dentries(struct inode *inode)
{
if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) {
/*
- * exFAT spec allows a dir to grow upto 8388608(256MB)
+ * exFAT spec allows a dir to grow up to 8388608(256MB)
* dentries
*/
return -ENOSPC;
@@ -318,8 +318,7 @@ static int exfat_find_empty_entry(struct inode *inode,
hint_femp.eidx = EXFAT_HINT_NONE;
if (ei->hint_femp.eidx != EXFAT_HINT_NONE) {
- memcpy(&hint_femp, &ei->hint_femp,
- sizeof(struct exfat_hint_femp));
+ hint_femp = ei->hint_femp;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
}
@@ -519,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
if (ret)
goto out;
- memcpy(&info->dir, p_dir, sizeof(struct exfat_chain));
+ info->dir = *p_dir;
info->entry = dentry;
info->flags = ALLOC_NO_FAT_CHAIN;
info->type = type;
@@ -530,19 +529,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->size = 0;
info->num_subdirs = 0;
} else {
- int count;
- struct exfat_chain cdir;
-
info->attr = ATTR_SUBDIR;
info->start_clu = start_clu;
info->size = clu_size;
-
- exfat_chain_set(&cdir, info->start_clu,
- EXFAT_B_TO_CLU(info->size, sbi), info->flags);
- count = exfat_count_dir_entries(sb, &cdir);
- if (count < 0)
- return -EIO;
- info->num_subdirs = count + EXFAT_MIN_SUBDIR;
+ info->num_subdirs = EXFAT_MIN_SUBDIR;
}
memset(&info->crtime, 0, sizeof(info->crtime));
memset(&info->mtime, 0, sizeof(info->mtime));
@@ -604,6 +594,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
+ struct exfat_dentry *ep, *ep2;
+ struct exfat_entry_set_cache *es;
if (qname->len == 0)
return -ENOENT;
@@ -629,91 +621,63 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
num_entries, TYPE_ALL);
- if ((dentry < 0) && (dentry != -EEXIST))
+ if (dentry < 0)
return dentry; /* -error value */
- memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain));
+ info->dir = cdir;
info->entry = dentry;
info->num_subdirs = 0;
- /* root directory itself */
- if (unlikely(dentry == -EEXIST)) {
- int num_clu = 0;
+ es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
+ if (!es)
+ return -EIO;
+ ep = exfat_get_dentry_cached(es, 0);
+ ep2 = exfat_get_dentry_cached(es, 1);
+
+ info->type = exfat_get_entry_type(ep);
+ info->attr = le16_to_cpu(ep->dentry.file.attr);
+ info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ if ((info->type == TYPE_FILE) && (info->size == 0)) {
+ info->flags = ALLOC_NO_FAT_CHAIN;
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ } else {
+ info->flags = ep2->dentry.stream.flags;
+ info->start_clu =
+ le32_to_cpu(ep2->dentry.stream.start_clu);
+ }
- info->type = TYPE_DIR;
- info->attr = ATTR_SUBDIR;
- info->flags = ALLOC_FAT_CHAIN;
- info->start_clu = sbi->root_dir;
- memset(&info->crtime, 0, sizeof(info->crtime));
- memset(&info->mtime, 0, sizeof(info->mtime));
- memset(&info->atime, 0, sizeof(info->atime));
-
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- info->size = num_clu << sbi->cluster_size_bits;
+ exfat_get_entry_time(sbi, &info->crtime,
+ ep->dentry.file.create_tz,
+ ep->dentry.file.create_time,
+ ep->dentry.file.create_date,
+ ep->dentry.file.create_time_cs);
+ exfat_get_entry_time(sbi, &info->mtime,
+ ep->dentry.file.modify_tz,
+ ep->dentry.file.modify_time,
+ ep->dentry.file.modify_date,
+ ep->dentry.file.modify_time_cs);
+ exfat_get_entry_time(sbi, &info->atime,
+ ep->dentry.file.access_tz,
+ ep->dentry.file.access_time,
+ ep->dentry.file.access_date,
+ 0);
+ exfat_free_dentry_set(es, false);
+
+ if (ei->start_clu == EXFAT_FREE_CLUSTER) {
+ exfat_fs_error(sb,
+ "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
+ i_size_read(dir), ei->dir.dir, ei->entry);
+ return -EIO;
+ }
+ if (info->type == TYPE_DIR) {
+ exfat_chain_set(&cdir, info->start_clu,
+ EXFAT_B_TO_CLU(info->size, sbi), info->flags);
count = exfat_count_dir_entries(sb, &cdir);
if (count < 0)
return -EIO;
- info->num_subdirs = count;
- } else {
- struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
-
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
- if (!es)
- return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
-
- info->type = exfat_get_entry_type(ep);
- info->attr = le16_to_cpu(ep->dentry.file.attr);
- info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
- if ((info->type == TYPE_FILE) && (info->size == 0)) {
- info->flags = ALLOC_NO_FAT_CHAIN;
- info->start_clu = EXFAT_EOF_CLUSTER;
- } else {
- info->flags = ep2->dentry.stream.flags;
- info->start_clu =
- le32_to_cpu(ep2->dentry.stream.start_clu);
- }
-
- if (ei->start_clu == EXFAT_FREE_CLUSTER) {
- exfat_fs_error(sb,
- "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)",
- i_size_read(dir), ei->dir.dir, ei->entry);
- exfat_free_dentry_set(es, false);
- return -EIO;
- }
-
- exfat_get_entry_time(sbi, &info->crtime,
- ep->dentry.file.create_tz,
- ep->dentry.file.create_time,
- ep->dentry.file.create_date,
- ep->dentry.file.create_time_cs);
- exfat_get_entry_time(sbi, &info->mtime,
- ep->dentry.file.modify_tz,
- ep->dentry.file.modify_time,
- ep->dentry.file.modify_date,
- ep->dentry.file.modify_time_cs);
- exfat_get_entry_time(sbi, &info->atime,
- ep->dentry.file.access_tz,
- ep->dentry.file.access_time,
- ep->dentry.file.access_date,
- 0);
- exfat_free_dentry_set(es, false);
-
- if (info->type == TYPE_DIR) {
- exfat_chain_set(&cdir, info->start_clu,
- EXFAT_B_TO_CLU(info->size, sbi), info->flags);
- count = exfat_count_dir_entries(sb, &cdir);
- if (count < 0)
- return -EIO;
-
- info->num_subdirs = count + EXFAT_MIN_SUBDIR;
- }
+ info->num_subdirs = count + EXFAT_MIN_SUBDIR;
}
return 0;
}
@@ -1065,7 +1029,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
if (!epnew)
return -EIO;
- memcpy(epnew, epold, DENTRY_SIZE);
+ *epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
@@ -1085,7 +1049,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
return -EIO;
}
- memcpy(epnew, epold, DENTRY_SIZE);
+ *epnew = *epold;
exfat_update_bh(new_bh, sync);
brelse(old_bh);
brelse(new_bh);
@@ -1130,11 +1094,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
if (!epmov)
return -EIO;
- /* check if the source and target directory is the same */
- if (exfat_get_entry_type(epmov) == TYPE_DIR &&
- le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir)
- return -EINVAL;
-
num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
epmov);
if (num_old_entries < 0)
@@ -1153,7 +1112,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
if (!epnew)
return -EIO;
- memcpy(epnew, epmov, DENTRY_SIZE);
+ *epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
ei->attr |= ATTR_ARCHIVE;
@@ -1173,7 +1132,7 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
return -EIO;
}
- memcpy(epnew, epmov, DENTRY_SIZE);
+ *epnew = *epmov;
exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
brelse(new_bh);
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index a3c927501e67..675d0e7058c5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -11,7 +11,7 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-/* Upcase tabel macro */
+/* Upcase table macro */
#define EXFAT_NUM_UPCASE (2918)
#define UTBL_COUNT (0x10000)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 60b941ba557b..87be5bfc31eb 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -89,8 +89,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */
buf->f_bfree = buf->f_blocks - sbi->used_clusters;
buf->f_bavail = buf->f_bfree;
- buf->f_fsid.val[0] = (unsigned int)id;
- buf->f_fsid.val[1] = (unsigned int)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
/* Unicode utf16 255 characters */
buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE;
return 0;
@@ -342,7 +341,6 @@ static int exfat_read_root(struct inode *inode)
ei->flags = ALLOC_FAT_CHAIN;
ei->type = TYPE_DIR;
ei->version = 0;
- ei->rwoffset = 0;
ei->hint_bmap.off = EXFAT_EOF_CLUSTER;
ei->hint_stat.eidx = 0;
ei->hint_stat.clu = sbi->root_dir;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7fab2b3b5b39..09f1fe676972 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1455,8 +1455,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
buf->f_namelen = EXT2_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = u64_to_fsid(fsid);
spin_unlock(&sbi->s_lock);
return 0;
}
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 2e42f47a7f98..49e7af6cc93f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
- xattr_user.o
+ xattr_user.o fast_commit.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 76f634d185f1..68aaed48315f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -242,6 +242,7 @@ retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_start_update(inode);
if ((type == ACL_TYPE_ACCESS) && acl) {
error = posix_acl_update_mode(inode, &mode, &acl);
@@ -259,6 +260,7 @@ retry:
}
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_update(inode);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
return error;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 48c3df47748d..1d640b145637 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -495,10 +500,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
*/
set_buffer_new(bh);
trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
- (ignore_locked ? REQ_RAHEAD : 0), bh);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0),
+ ext4_end_bitmap_read);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index c54ba52f2dd4..8e6ca23ed172 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -131,7 +131,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_INFO "System zones: ");
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
+ system_blks = rcu_dereference(sbi->s_system_blks);
node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
@@ -261,7 +261,7 @@ int ext4_setup_system_zone(struct super_block *sb)
* with ext4_data_block_valid() accessing the rbtree at the same
* time.
*/
- rcu_assign_pointer(sbi->system_blks, system_blks);
+ rcu_assign_pointer(sbi->s_system_blks, system_blks);
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
@@ -286,9 +286,9 @@ void ext4_release_system_zone(struct super_block *sb)
{
struct ext4_system_blocks *system_blks;
- system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
lockdep_is_held(&sb->s_umount));
- rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+ rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);
if (system_blks)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
@@ -319,7 +319,7 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
* mount option.
*/
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
+ system_blks = rcu_dereference(sbi->s_system_blks);
if (system_blks == NULL)
goto out_rcu;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index efe77cffc322..5b81f3b080ee 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -674,7 +674,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
{
struct qstr qstr = {.name = str, .len = len };
const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = READ_ONCE(parent->d_inode);
+ const struct inode *inode = d_inode_rcu(parent);
char strbuf[DNAME_INLINE_LEN];
if (!inode || !IS_CASEFOLDED(inode) ||
@@ -706,7 +706,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = READ_ONCE(dentry->d_inode);
+ const struct inode *inode = d_inode_rcu(dentry);
unsigned char *norm;
int len, ret = 0;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9a692c0a66c..254d1c26bea8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -27,7 +27,6 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
-#include <linux/version.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
@@ -492,7 +491,7 @@ struct flex_groups {
/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
- EXT4_JOURNAL_DATA_FL)
+ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
@@ -964,6 +963,7 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
#include "extents_status.h"
+#include "fast_commit.h"
/*
* Lock subclasses for i_data_sem in the ext4_inode_info structure.
@@ -1021,6 +1021,31 @@ struct ext4_inode_info {
struct list_head i_orphan; /* unlinked but open inodes */
+ /* Fast commit related info */
+
+ struct list_head i_fc_list; /*
+ * inodes that need fast commit
+ * protected by sbi->s_fc_lock.
+ */
+
+ /* Fast commit subtid when this inode was committed */
+ unsigned int i_fc_committed_subtid;
+
+ /* Start of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_start;
+
+ /* End of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_len;
+
+ /* Number of ongoing updates on this inode */
+ atomic_t i_fc_updates;
+
+ /* Fast commit wait queue for this inode */
+ wait_queue_head_t i_fc_wait;
+
+ /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
+ struct mutex i_fc_lock;
+
/*
* i_disksize keeps track of what the inode size is ON DISK, not
* in memory. During truncate, i_size is set to the new size by
@@ -1141,6 +1166,11 @@ struct ext4_inode_info {
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT4_ERROR_FS 0x0002 /* Errors detected */
#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+#define EXT4_FC_INELIGIBLE 0x0008 /* Fast commit ineligible */
+#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast
+ * commit.
+ */
+#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
* Misc. filesystem flags
@@ -1214,6 +1244,8 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1481,14 +1513,14 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *journal_bdev;
+ struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct ext4_system_blocks __rcu *system_blks;
+ struct ext4_system_blocks __rcu *s_system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1611,6 +1643,34 @@ struct ext4_sb_info {
/* Record the errseq of the backing block device */
errseq_t s_bdev_wb_err;
spinlock_t s_bdev_wb_lock;
+
+ /* Ext4 fast commit stuff */
+ atomic_t s_fc_subtid;
+ atomic_t s_fc_ineligible_updates;
+ /*
+ * After commit starts, the main queue gets locked, and the further
+ * updates get added in the staging queue.
+ */
+#define FC_Q_MAIN 0
+#define FC_Q_STAGING 1
+ struct list_head s_fc_q[2]; /* Inodes staged for fast commit
+ * that have data changes in them.
+ */
+ struct list_head s_fc_dentry_q[2]; /* directory entry updates */
+ unsigned int s_fc_bytes;
+ /*
+ * Main fast commit lock. This lock protects accesses to the
+ * following fields:
+ * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
+ */
+ spinlock_t s_fc_lock;
+ struct buffer_head *s_fc_bh;
+ struct ext4_fc_stats s_fc_stats;
+ u64 s_fc_avg_commit_time;
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ struct ext4_fc_replay_state s_fc_replay_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1721,6 +1781,7 @@ enum {
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1814,6 +1875,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
@@ -1916,6 +1978,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
@@ -2650,6 +2713,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
+extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner, __u32 i_flags,
@@ -2675,6 +2739,27 @@ extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+/* fast_commit.c */
+int ext4_fc_info_show(struct seq_file *seq, void *v);
+void ext4_fc_init(struct super_block *sb, journal_t *journal);
+void ext4_fc_init_inode(struct inode *inode);
+void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
+void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry);
+void ext4_fc_track_link(struct inode *inode, struct dentry *dentry);
+void ext4_fc_track_create(struct inode *inode, struct dentry *dentry);
+void ext4_fc_track_inode(struct inode *inode);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
+void ext4_fc_start_ineligible(struct super_block *sb, int reason);
+void ext4_fc_stop_ineligible(struct super_block *sb);
+void ext4_fc_start_update(struct inode *inode);
+void ext4_fc_stop_update(struct inode *inode);
+void ext4_fc_del(struct inode *inode);
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
+void ext4_fc_replay_cleanup(struct super_block *sb);
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
+int __init ext4_fc_init_dentry_cache(void);
+
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern long ext4_mb_stats;
@@ -2704,8 +2789,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
+extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state);
/* inode.c */
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
@@ -2752,6 +2841,8 @@ extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
@@ -2785,12 +2876,15 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+extern void ext4_reset_inode_seed(struct inode *inode);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
+extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
extern int ext4_orphan_add(handle_t *, struct inode *);
@@ -2824,6 +2918,14 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
sector_t block, int op_flags);
+extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block);
+extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
+extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
@@ -3012,22 +3114,24 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb)
return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
}
+#define ext4_read_incompat_64bit_val(es, name) \
+ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
+ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \
+ le32_to_cpu(es->name##_lo))
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_blocks_count);
}
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_r_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_r_blocks_count);
}
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_free_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_free_blocks_count);
}
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
@@ -3153,6 +3257,9 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
struct ext4_group_info {
unsigned long bb_state;
+#ifdef AGGRESSIVE_CHECK
+ unsigned long bb_check_counter;
+#endif
struct rb_root bb_free_root;
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
@@ -3357,6 +3464,10 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
extern int ext4_ci_compare(const struct inode *parent,
const struct qstr *fname,
const struct qstr *entry, bool quick);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode);
+extern int __ext4_link(struct inode *dir, struct inode *inode,
+ struct dentry *dentry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
@@ -3457,6 +3568,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
int check_cred, int restart_cred,
int revoke_cred);
+extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
+extern int ext4_ext_replay_set_iblocks(struct inode *inode);
+extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk);
+extern int ext4_ext_clear_bb(struct inode *inode);
/* move_extent.c */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 760b9ee49dc0..0fd0c42a4f7d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -100,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
+ if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
GFP_NOFS, type, line);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a0481582187a..559100f3e23c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -501,7 +501,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err < 0)
goto errout;
}
@@ -3723,6 +3723,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
+ ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return err;
}
@@ -3794,6 +3795,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
+ ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return 0;
}
@@ -4023,7 +4025,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
- * return > 0, number of of blocks already mapped/allocated
+ * return > 0, number of blocks already mapped/allocated
* if create == 0 and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
@@ -4327,7 +4329,7 @@ got_allocated_blocks:
map->m_len = ar.len;
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
-
+ ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1);
out:
ext4_ext_drop_refs(path);
kfree(path);
@@ -4600,7 +4602,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
-
+ ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits,
+ (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
@@ -4648,23 +4651,34 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
+ ext4_fc_track_range(inode, offset >> blkbits,
+ (offset + len - 1) >> blkbits);
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(inode, offset, len);
+ ext4_fc_start_update(inode);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ ret = ext4_punch_hole(inode, offset, len);
+ goto exit;
+ }
ret = ext4_convert_inline_data(inode);
if (ret)
- return ret;
+ goto exit;
- if (mode & FALLOC_FL_COLLAPSE_RANGE)
- return ext4_collapse_range(inode, offset, len);
-
- if (mode & FALLOC_FL_INSERT_RANGE)
- return ext4_insert_range(inode, offset, len);
+ if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ ret = ext4_collapse_range(inode, offset, len);
+ goto exit;
+ }
- if (mode & FALLOC_FL_ZERO_RANGE)
- return ext4_zero_range(file, offset, len, mode);
+ if (mode & FALLOC_FL_INSERT_RANGE) {
+ ret = ext4_insert_range(inode, offset, len);
+ goto exit;
+ }
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = ext4_zero_range(file, offset, len, mode);
+ goto exit;
+ }
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
@@ -4698,12 +4712,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
}
out:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+exit:
+ ext4_fc_stop_update(inode);
return ret;
}
@@ -4769,7 +4785,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
- int ret, err = 0;
+ int ret = 0, err = 0;
struct ext4_io_end_vec *io_end_vec;
/*
@@ -5291,6 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
@@ -5329,6 +5346,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
@@ -5429,6 +5447,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
@@ -5503,6 +5522,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
@@ -5784,3 +5804,264 @@ out:
return err ? err : mapped;
}
+
+/*
+ * Updates physical block address and unwritten status of extent
+ * starting at lblk start and of len. If such an extent doesn't exist,
+ * this function splits the extent tree appropriately to create an
+ * extent like this. This function is called in the fast commit
+ * replay path. Returns 0 on success and error on failure.
+ */
+int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk)
+{
+ struct ext4_ext_path *path = NULL, *ppath;
+ struct ext4_extent *ex;
+ int ret;
+
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (!path)
+ return -EINVAL;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (le32_to_cpu(ex->ee_block) != start ||
+ ext4_ext_get_actual_len(ex) != len) {
+ /* We need to split this extent to match our extent first */
+ ppath = path;
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -1;
+ ppath = path;
+ ex = path[path->p_depth].p_ext;
+ WARN_ON(le32_to_cpu(ex->ee_block) != start);
+ if (ext4_ext_get_actual_len(ex) != len) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath,
+ start + len, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -EINVAL;
+ ex = path[path->p_depth].p_ext;
+ }
+ }
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_store_pblock(ex, pblk);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return ret;
+}
+
+/* Try to shrink the extent tree */
+void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t old_cur, cur = 0;
+
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ return;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return;
+ }
+ old_cur = cur;
+ cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ if (cur <= old_cur)
+ cur = old_cur + 1;
+ ext4_ext_try_to_merge(NULL, inode, path, ex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_mark_inode_dirty(NULL, inode);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+}
+
+/* Check if *cur is a hole and if it is, skip it */
+static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
+{
+ int ret;
+ struct ext4_map_blocks map;
+
+ map.m_lblk = *cur;
+ map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret != 0)
+ return;
+ *cur = *cur + map.m_len;
+}
+
+/* Count number of blocks used by this inode and update i_blocks */
+int ext4_ext_replay_set_iblocks(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL, *path2 = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int numblks = 0, i, ret = 0;
+ ext4_fsblk_t cmp1, cmp2;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ goto out;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Count the number of data blocks */
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0)
+ numblks += ret;
+ cur = cur + map.m_len;
+ }
+
+ /*
+ * Count the number of extent tree blocks. We do it by looking up
+ * two successive extents and determining the difference between
+ * their paths. When path is different for 2 successive extents
+ * we compare the blocks in the path at each level and increment
+ * iblocks by total number of differences found.
+ */
+ cur = 0;
+ skip_hole(inode, &cur);
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ goto out;
+ numblks += path->p_depth;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ break;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return 0;
+ }
+ cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex));
+ skip_hole(inode, &cur);
+
+ path2 = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path2)) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ break;
+ }
+ ex = path2[path2->p_depth].p_ext;
+ for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
+ cmp1 = cmp2 = 0;
+ if (i <= path->p_depth)
+ cmp1 = path[i].p_bh ?
+ path[i].p_bh->b_blocknr : 0;
+ if (i <= path2->p_depth)
+ cmp2 = path2[i].p_bh ?
+ path2[i].p_bh->b_blocknr : 0;
+ if (cmp1 != cmp2 && cmp2 != 0)
+ numblks++;
+ }
+ ext4_ext_drop_refs(path);
+ ext4_ext_drop_refs(path2);
+ kfree(path);
+ kfree(path2);
+ }
+
+out:
+ inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
+ ext4_mark_inode_dirty(NULL, inode);
+ return 0;
+}
+
+int ext4_ext_clear_bb(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int j, ret = 0;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return 0;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR_OR_NULL(path)) {
+ for (j = 0; j < path->p_depth; j++) {
+
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 0);
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ }
+ cur = cur + map.m_len;
+ }
+
+ return 0;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e75171535375..0a729027322d 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
trace_ext4_es_find_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
@@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode,
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_clu(inode, matching_fn, lblk);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -812,6 +821,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -873,6 +885,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
@@ -908,6 +923,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node;
int found = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
@@ -1419,6 +1437,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
int err = 0;
int reserved = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1969,6 +1990,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
int err = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
new file mode 100644
index 000000000000..447c8d93f480
--- /dev/null
+++ b/fs/ext4/fast_commit.c
@@ -0,0 +1,2139 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fs/ext4/fast_commit.c
+ *
+ * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+ *
+ * Ext4 fast commits routines.
+ */
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "mballoc.h"
+
+/*
+ * Ext4 Fast Commits
+ * -----------------
+ *
+ * Ext4 fast commits implement fine grained journalling for Ext4.
+ *
+ * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
+ * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
+ * TLV during the recovery phase. For the scenarios for which we currently
+ * don't have replay code, fast commit falls back to full commits.
+ * Fast commits record delta in one of the following three categories.
+ *
+ * (A) Directory entry updates:
+ *
+ * - EXT4_FC_TAG_UNLINK - records directory entry unlink
+ * - EXT4_FC_TAG_LINK - records directory entry link
+ * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
+ *
+ * (B) File specific data range updates:
+ *
+ * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
+ * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
+ *
+ * (C) Inode metadata (mtime / ctime etc):
+ *
+ * - EXT4_FC_TAG_INODE - record the inode that should be replayed
+ * during recovery. Note that iblocks field is
+ * not replayed and instead derived during
+ * replay.
+ * Commit Operation
+ * ----------------
+ * With fast commits, we maintain all the directory entry operations in the
+ * order in which they are issued in an in-memory queue. This queue is flushed
+ * to disk during the commit operation. We also maintain a list of inodes
+ * that need to be committed during a fast commit in another in memory queue of
+ * inodes. During the commit operation, we commit in the following order:
+ *
+ * [1] Lock inodes for any further data updates by setting COMMITTING state
+ * [2] Submit data buffers of all the inodes
+ * [3] Wait for [2] to complete
+ * [4] Commit all the directory entry updates in the fast commit space
+ * [5] Commit all the changed inode structures
+ * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * section for more details).
+ * [7] Wait for [4], [5] and [6] to complete.
+ *
+ * All the inode updates must call ext4_fc_start_update() before starting an
+ * update. If such an ongoing update is present, fast commit waits for it to
+ * complete. The completion of such an update is marked by
+ * ext4_fc_stop_update().
+ *
+ * Fast Commit Ineligibility
+ * -------------------------
+ * Not all operations are supported by fast commits today (e.g extended
+ * attributes). Fast commit ineligiblity is marked by calling one of the
+ * two following functions:
+ *
+ * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
+ * back to full commit. This is useful in case of transient errors.
+ *
+ * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
+ * the fast commits happening between ext4_fc_start_ineligible() and
+ * ext4_fc_stop_ineligible() and one fast commit after the call to
+ * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
+ * make one more fast commit to fall back to full commit after stop call so
+ * that it guaranteed that the fast commit ineligible operation contained
+ * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
+ * followed by at least 1 full commit.
+ *
+ * Atomicity of commits
+ * --------------------
+ * In order to gaurantee atomicity during the commit operation, fast commit
+ * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
+ * tag contains CRC of the contents and TID of the transaction after which
+ * this fast commit should be applied. Recovery code replays fast commit
+ * logs only if there's at least 1 valid tail present. For every fast commit
+ * operation, there is 1 tail. This means, we may end up with multiple tails
+ * in the fast commit space. Here's an example:
+ *
+ * - Create a new file A and remove existing file B
+ * - fsync()
+ * - Append contents to file A
+ * - Truncate file A
+ * - fsync()
+ *
+ * The fast commit space at the end of above operations would look like this:
+ * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
+ * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
+ *
+ * Replay code should thus check for all the valid tails in the FC area.
+ *
+ * TODOs
+ * -----
+ * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
+ * eligible update must be protected within ext4_fc_start_update() and
+ * ext4_fc_stop_update(). These routines are called at much higher
+ * routines. This can be made more fine grained by combining with
+ * ext4_journal_start().
+ *
+ * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
+ *
+ * 3) Handle more ineligible cases.
+ */
+
+#include <trace/events/ext4.h>
+static struct kmem_cache *ext4_fc_dentry_cachep;
+
+static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ BUFFER_TRACE(bh, "");
+ if (uptodate) {
+ ext4_debug("%s: Block %lld up-to-date",
+ __func__, bh->b_blocknr);
+ set_buffer_uptodate(bh);
+ } else {
+ ext4_debug("%s: Block %lld not up-to-date",
+ __func__, bh->b_blocknr);
+ clear_buffer_uptodate(bh);
+ }
+
+ unlock_buffer(bh);
+}
+
+static inline void ext4_fc_reset_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ei->i_fc_lblk_start = 0;
+ ei->i_fc_lblk_len = 0;
+}
+
+void ext4_fc_init_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_fc_reset_inode(inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ INIT_LIST_HEAD(&ei->i_fc_list);
+ init_waitqueue_head(&ei->i_fc_wait);
+ atomic_set(&ei->i_fc_updates, 0);
+ ei->i_fc_committed_subtid = 0;
+}
+
+/*
+ * Inform Ext4's fast about start of an inode update
+ *
+ * This function is called by the high level call VFS callbacks before
+ * performing any inode update. This function blocks if there's an ongoing
+ * fast commit on the inode in question.
+ */
+void ext4_fc_start_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list))
+ goto out;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ wait_queue_head_t *wq;
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto restart;
+ }
+out:
+ atomic_inc(&ei->i_fc_updates);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
+
+/*
+ * Stop inode update and wake up waiting fast commits if any.
+ */
+void ext4_fc_stop_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (atomic_dec_and_test(&ei->i_fc_updates))
+ wake_up_all(&ei->i_fc_wait);
+}
+
+/*
+ * Remove inode from fast commit list. If the inode is being committed
+ * we wait until inode commit is done.
+ */
+void ext4_fc_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list)) {
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ return;
+ }
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ wait_queue_head_t *wq;
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto restart;
+ }
+ if (!list_empty(&ei->i_fc_list))
+ list_del_init(&ei->i_fc_list);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
+
+/*
+ * Mark file system as fast commit ineligible. This means that next commit
+ * operation would result in a full jbd2 commit.
+ */
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+}
+
+/*
+ * Start a fast commit ineligible update. Any commits that happen while
+ * such an operation is in progress fall back to full commits.
+ */
+void ext4_fc_start_ineligible(struct super_block *sb, int reason)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+ atomic_inc(&sbi->s_fc_ineligible_updates);
+}
+
+/*
+ * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here
+ * to ensure that after stopping the ineligible update, at least one full
+ * commit takes place.
+ */
+void ext4_fc_stop_ineligible(struct super_block *sb)
+{
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
+ atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
+}
+
+static inline int ext4_fc_is_ineligible(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) ||
+ atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
+}
+
+/*
+ * Generic fast commit tracking function. If this is the first time this we are
+ * called after a full commit, we initialize fast commit fields and then call
+ * __fc_track_fn() with update = 0. If we have already been called after a full
+ * commit, we pass update = 1. Based on that, the track function can determine
+ * if it needs to track a field for the first time or if it needs to just
+ * update the previously tracked value.
+ *
+ * If enqueue is set, this function enqueues the inode in fast commit list.
+ */
+static int ext4_fc_track_template(
+ struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
+ void *args, int enqueue)
+{
+ tid_t running_txn_tid;
+ bool update = false;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return -EOPNOTSUPP;
+
+ if (ext4_fc_is_ineligible(inode->i_sb))
+ return -EINVAL;
+
+ running_txn_tid = sbi->s_journal ?
+ sbi->s_journal->j_commit_sequence + 1 : 0;
+
+ mutex_lock(&ei->i_fc_lock);
+ if (running_txn_tid == ei->i_sync_tid) {
+ update = true;
+ } else {
+ ext4_fc_reset_inode(inode);
+ ei->i_sync_tid = running_txn_tid;
+ }
+ ret = __fc_track_fn(inode, args, update);
+ mutex_unlock(&ei->i_fc_lock);
+
+ if (!enqueue)
+ return ret;
+
+ spin_lock(&sbi->s_fc_lock);
+ if (list_empty(&EXT4_I(inode)->i_fc_list))
+ list_add_tail(&EXT4_I(inode)->i_fc_list,
+ (sbi->s_mount_state & EXT4_FC_COMMITTING) ?
+ &sbi->s_fc_q[FC_Q_STAGING] :
+ &sbi->s_fc_q[FC_Q_MAIN]);
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+struct __track_dentry_update_args {
+ struct dentry *dentry;
+ int op;
+};
+
+/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
+static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_fc_dentry_update *node;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct __track_dentry_update_args *dentry_update =
+ (struct __track_dentry_update_args *)arg;
+ struct dentry *dentry = dentry_update->dentry;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ mutex_unlock(&ei->i_fc_lock);
+ node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
+ if (!node) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+
+ node->fcd_op = dentry_update->op;
+ node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_ino = inode->i_ino;
+ if (dentry->d_name.len > DNAME_INLINE_LEN) {
+ node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
+ if (!node->fcd_name.name) {
+ kmem_cache_free(ext4_fc_dentry_cachep, node);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_MEM);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+ memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
+ dentry->d_name.len);
+ } else {
+ memcpy(node->fcd_iname, dentry->d_name.name,
+ dentry->d_name.len);
+ node->fcd_name.name = node->fcd_iname;
+ }
+ node->fcd_name.len = dentry->d_name.len;
+
+ spin_lock(&sbi->s_fc_lock);
+ if (sbi->s_mount_state & EXT4_FC_COMMITTING)
+ list_add_tail(&node->fcd_list,
+ &sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ else
+ list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ spin_unlock(&sbi->s_fc_lock);
+ mutex_lock(&ei->i_fc_lock);
+
+ return 0;
+}
+
+void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_UNLINK;
+
+ ret = ext4_fc_track_template(inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_unlink(inode, dentry, ret);
+}
+
+void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_LINK;
+
+ ret = ext4_fc_track_template(inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_link(inode, dentry, ret);
+}
+
+void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_CREAT;
+
+ ret = ext4_fc_track_template(inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_create(inode, dentry, ret);
+}
+
+/* __track_fn for inode tracking */
+static int __track_inode(struct inode *inode, void *arg, bool update)
+{
+ if (update)
+ return -EEXIST;
+
+ EXT4_I(inode)->i_fc_lblk_len = 0;
+
+ return 0;
+}
+
+void ext4_fc_track_inode(struct inode *inode)
+{
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
+ trace_ext4_fc_track_inode(inode, ret);
+}
+
+struct __track_range_args {
+ ext4_lblk_t start, end;
+};
+
+/* __track_fn for tracking data updates */
+static int __track_range(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t oldstart;
+ struct __track_range_args *__arg =
+ (struct __track_range_args *)arg;
+
+ if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
+ ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+ return -ECANCELED;
+ }
+
+ oldstart = ei->i_fc_lblk_start;
+
+ if (update && ei->i_fc_lblk_len > 0) {
+ ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
+ ei->i_fc_lblk_len =
+ max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
+ ei->i_fc_lblk_start + 1;
+ } else {
+ ei->i_fc_lblk_start = __arg->start;
+ ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
+ }
+
+ return 0;
+}
+
+void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct __track_range_args args;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ args.start = start;
+ args.end = end;
+
+ ret = ext4_fc_track_template(inode, __track_range, &args, 1);
+
+ trace_ext4_fc_track_range(inode, start, end, ret);
+}
+
+static void ext4_fc_submit_bh(struct super_block *sb)
+{
+ int write_flags = REQ_SYNC;
+ struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
+
+ if (test_opt(sb, BARRIER))
+ write_flags |= REQ_FUA | REQ_PREFLUSH;
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = ext4_end_buffer_io_sync;
+ submit_bh(REQ_OP_WRITE, write_flags, bh);
+ EXT4_SB(sb)->s_fc_bh = NULL;
+}
+
+/* Ext4 commit path routines */
+
+/* memzero and update CRC */
+static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
+ u32 *crc)
+{
+ void *ret;
+
+ ret = memset(dst, 0, len);
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
+ return ret;
+}
+
+/*
+ * Allocate len bytes on a fast commit buffer.
+ *
+ * During the commit time this function is used to manage fast commit
+ * block space. We don't split a fast commit log onto different
+ * blocks. So this function makes sure that if there's not enough space
+ * on the current block, the remaining space in the current block is
+ * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
+ * new block is from jbd2 and CRC is updated to reflect the padding
+ * we added.
+ */
+static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
+{
+ struct ext4_fc_tl *tl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh;
+ int bsize = sbi->s_journal->j_blocksize;
+ int ret, off = sbi->s_fc_bytes % bsize;
+ int pad_len;
+
+ /*
+ * After allocating len, we should have space at least for a 0 byte
+ * padding.
+ */
+ if (len + sizeof(struct ext4_fc_tl) > bsize)
+ return NULL;
+
+ if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
+ /*
+ * Only allocate from current buffer if we have enough space for
+ * this request AND we have space to add a zero byte padding.
+ */
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ sbi->s_fc_bytes += len;
+ return sbi->s_fc_bh->b_data + off;
+ }
+ /* Need to add PAD tag */
+ tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
+ tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
+ tl->fc_len = cpu_to_le16(pad_len);
+ if (crc)
+ *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
+ if (pad_len > 0)
+ ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+ ext4_fc_submit_bh(sb);
+
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ return sbi->s_fc_bh->b_data;
+}
+
+/* memcpy to fc reserved space and update CRC */
+static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
+ int len, u32 *crc)
+{
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
+ return memcpy(dst, src, len);
+}
+
+/*
+ * Complete a fast commit by writing tail tag.
+ *
+ * Writing tail tag marks the end of a fast commit. In order to guarantee
+ * atomicity, after writing tail tag, even if there's space remaining
+ * in the block, next commit shouldn't use it. That's why tail tag
+ * has the length as that of the remaining space on the block.
+ */
+static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ int off, bsize = sbi->s_journal->j_blocksize;
+ u8 *dst;
+
+ /*
+ * ext4_fc_reserve_space takes care of allocating an extra block if
+ * there's no enough space on this block for accommodating this tail.
+ */
+ dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
+ if (!dst)
+ return -ENOSPC;
+
+ off = sbi->s_fc_bytes % bsize;
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
+ tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
+
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
+ dst += sizeof(tl);
+ tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
+ ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ dst += sizeof(tail.fc_tid);
+ tail.fc_crc = cpu_to_le32(crc);
+ ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+
+ ext4_fc_submit_bh(sb);
+
+ return 0;
+}
+
+/*
+ * Adds tag, length, value and updates CRC. Returns true if tlv was added.
+ * Returns false if there's not enough space.
+ */
+static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
+ u32 *crc)
+{
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
+ if (!dst)
+ return false;
+
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(len);
+
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
+ ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
+
+ return true;
+}
+
+/* Same as above, but adds dentry tlv. */
+static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
+ int parent_ino, int ino, int dlen,
+ const unsigned char *dname,
+ u32 *crc)
+{
+ struct ext4_fc_dentry_info fcd;
+ struct ext4_fc_tl tl;
+ u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
+ crc);
+
+ if (!dst)
+ return false;
+
+ fcd.fc_parent_ino = cpu_to_le32(parent_ino);
+ fcd.fc_ino = cpu_to_le32(ino);
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
+ dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ dst += sizeof(fcd);
+ ext4_fc_memcpy(sb, dst, dname, dlen, crc);
+ dst += dlen;
+
+ return true;
+}
+
+/*
+ * Writes inode in the fast commit space under TLV with tag @tag.
+ * Returns 0 on success, error on failure.
+ */
+static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+ int ret;
+ struct ext4_iloc iloc;
+ struct ext4_fc_inode fc_inode;
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ inode_len += ei->i_extra_isize;
+
+ fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
+ tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+
+ dst = ext4_fc_reserve_space(inode->i_sb,
+ sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
+ if (!dst)
+ return -ECANCELED;
+
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
+ return -ECANCELED;
+ dst += sizeof(tl);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
+ return -ECANCELED;
+ dst += sizeof(fc_inode);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
+ inode_len, crc))
+ return -ECANCELED;
+
+ return 0;
+}
+
+/*
+ * Writes updated data ranges for the inode in question. Updates CRC.
+ * Returns 0 on success, error otherwise.
+ */
+static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
+{
+ ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ struct ext4_fc_add_range fc_ext;
+ struct ext4_fc_del_range lrange;
+ struct ext4_extent *ex;
+ int ret;
+
+ mutex_lock(&ei->i_fc_lock);
+ if (ei->i_fc_lblk_len == 0) {
+ mutex_unlock(&ei->i_fc_lock);
+ return 0;
+ }
+ old_blk_size = ei->i_fc_lblk_start;
+ new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+ ei->i_fc_lblk_len = 0;
+ mutex_unlock(&ei->i_fc_lock);
+
+ cur_lblk_off = old_blk_size;
+ jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
+ __func__, cur_lblk_off, new_blk_size, inode->i_ino);
+
+ while (cur_lblk_off <= new_blk_size) {
+ map.m_lblk = cur_lblk_off;
+ map.m_len = new_blk_size - cur_lblk_off + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return -ECANCELED;
+
+ if (map.m_len == 0) {
+ cur_lblk_off++;
+ continue;
+ }
+
+ if (ret == 0) {
+ lrange.fc_ino = cpu_to_le32(inode->i_ino);
+ lrange.fc_lblk = cpu_to_le32(map.m_lblk);
+ lrange.fc_len = cpu_to_le32(map.m_len);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+ sizeof(lrange), (u8 *)&lrange, crc))
+ return -ENOSPC;
+ } else {
+ fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+ ex = (struct ext4_extent *)&fc_ext.fc_ex;
+ ex->ee_block = cpu_to_le32(map.m_lblk);
+ ex->ee_len = cpu_to_le16(map.m_len);
+ ext4_ext_store_pblock(ex, map.m_pblk);
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+ sizeof(fc_ext), (u8 *)&fc_ext, crc))
+ return -ENOSPC;
+ }
+
+ cur_lblk_off += map.m_len;
+ }
+
+ return 0;
+}
+
+
+/* Submit data for all the fast commit inodes */
+static int ext4_fc_submit_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct list_head *pos;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ sbi->s_mount_state |= EXT4_FC_COMMITTING;
+ list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
+ ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
+ while (atomic_read(&ei->i_fc_updates)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&ei->i_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&ei->i_fc_updates)) {
+ spin_unlock(&sbi->s_fc_lock);
+ schedule();
+ spin_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(&ei->i_fc_wait, &wait);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ ret = jbd2_submit_inode_data(ei->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+/* Wait for completion of data for all the fast commit inodes */
+static int ext4_fc_wait_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *pos, *n;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ if (!ext4_test_inode_state(&pos->vfs_inode,
+ EXT4_STATE_FC_COMMITTING))
+ continue;
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = jbd2_wait_inode_data(journal, pos->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return 0;
+}
+
+/* Commit all the directory entry updates */
+static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_dentry_update *fc_dentry;
+ struct inode *inode;
+ struct list_head *pos, *n, *fcd_pos, *fcd_n;
+ struct ext4_inode_info *ei;
+ int ret;
+
+ if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
+ return 0;
+ list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
+ fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
+ fcd_list);
+ if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
+ spin_unlock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(
+ sb, fc_dentry->fcd_op,
+ fc_dentry->fcd_parent, fc_dentry->fcd_ino,
+ fc_dentry->fcd_name.len,
+ fc_dentry->fcd_name.name, crc)) {
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+ spin_lock(&sbi->s_fc_lock);
+ continue;
+ }
+
+ inode = NULL;
+ list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
+ ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
+ inode = &ei->vfs_inode;
+ break;
+ }
+ }
+ /*
+ * If we don't find inode in our list, then it was deleted,
+ * in which case, we don't need to record it's create tag.
+ */
+ if (!inode)
+ continue;
+ spin_unlock(&sbi->s_fc_lock);
+
+ /*
+ * We first write the inode and then the create dirent. This
+ * allows the recovery code to create an unnamed inode first
+ * and then link it to a directory entry. This allows us
+ * to use namei.c routines almost as is and simplifies
+ * the recovery code.
+ */
+ ret = ext4_fc_write_inode(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ ret = ext4_fc_write_inode_data(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ if (!ext4_fc_add_dentry_tlv(
+ sb, fc_dentry->fcd_op,
+ fc_dentry->fcd_parent, fc_dentry->fcd_ino,
+ fc_dentry->fcd_name.len,
+ fc_dentry->fcd_name.name, crc)) {
+ spin_lock(&sbi->s_fc_lock);
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ }
+ return 0;
+lock_and_exit:
+ spin_lock(&sbi->s_fc_lock);
+ return ret;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_head head;
+ struct list_head *pos;
+ struct inode *inode;
+ struct blk_plug plug;
+ int ret = 0;
+ u32 crc = 0;
+
+ ret = ext4_fc_submit_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ ret = ext4_fc_wait_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ blk_start_plug(&plug);
+ if (sbi->s_fc_bytes == 0) {
+ /*
+ * Add a head tag only if this is the first fast commit
+ * in this TID.
+ */
+ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
+ head.fc_tid = cpu_to_le32(
+ sbi->s_journal->j_running_transaction->t_tid);
+ if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
+ (u8 *)&head, &crc))
+ goto out;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ ret = ext4_fc_commit_dentry_updates(journal, &crc);
+ if (ret) {
+ spin_unlock(&sbi->s_fc_lock);
+ goto out;
+ }
+
+ list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
+ iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ inode = &iter->vfs_inode;
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ continue;
+
+ spin_unlock(&sbi->s_fc_lock);
+ ret = ext4_fc_write_inode_data(inode, &crc);
+ if (ret)
+ goto out;
+ ret = ext4_fc_write_inode(inode, &crc);
+ if (ret)
+ goto out;
+ spin_lock(&sbi->s_fc_lock);
+ EXT4_I(inode)->i_fc_committed_subtid =
+ atomic_read(&sbi->s_fc_subtid);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = ext4_fc_write_tail(sb, crc);
+
+out:
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+/*
+ * The main commit entry point. Performs a fast commit for transaction
+ * commit_tid if needed. If it's not possible to perform a fast commit
+ * due to various reasons, we fall back to full commit. Returns 0
+ * on success, error otherwise.
+ */
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nblks = 0, ret, bsize = journal->j_blocksize;
+ int subtid = atomic_read(&sbi->s_fc_subtid);
+ int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
+ ktime_t start_time, commit_time;
+
+ trace_ext4_fc_commit_start(sb);
+
+ start_time = ktime_get();
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (ext4_fc_is_ineligible(sb))) {
+ reason = EXT4_FC_REASON_INELIGIBLE;
+ goto out;
+ }
+
+restart_fc:
+ ret = jbd2_fc_begin_commit(journal, commit_tid);
+ if (ret == -EALREADY) {
+ /* There was an ongoing commit, check if we need to restart */
+ if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
+ commit_tid > journal->j_commit_sequence)
+ goto restart_fc;
+ reason = EXT4_FC_REASON_ALREADY_COMMITTED;
+ goto out;
+ } else if (ret) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_START_FAILED;
+ goto out;
+ }
+
+ fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+ ret = ext4_fc_perform_commit(journal);
+ if (ret < 0) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_FAILED;
+ goto out;
+ }
+ nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
+ ret = jbd2_fc_wait_bufs(journal, nblks);
+ if (ret < 0) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_FAILED;
+ goto out;
+ }
+ atomic_inc(&sbi->s_fc_subtid);
+ jbd2_fc_end_commit(journal);
+out:
+ /* Has any ineligible update happened since we started? */
+ if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_INELIGIBLE;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ if (reason != EXT4_FC_REASON_OK &&
+ reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
+ sbi->s_fc_stats.fc_ineligible_commits++;
+ } else {
+ sbi->s_fc_stats.fc_num_commits++;
+ sbi->s_fc_stats.fc_numblks += nblks;
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
+ trace_ext4_fc_commit_stop(sb, nblks, reason);
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in the commit time
+ */
+ if (likely(sbi->s_fc_avg_commit_time))
+ sbi->s_fc_avg_commit_time = (commit_time +
+ sbi->s_fc_avg_commit_time * 3) / 4;
+ else
+ sbi->s_fc_avg_commit_time = commit_time;
+ jbd_debug(1,
+ "Fast commit ended with blks = %d, reason = %d, subtid - %d",
+ nblks, reason, subtid);
+ if (reason == EXT4_FC_REASON_FC_FAILED)
+ return jbd2_fc_end_commit_fallback(journal, commit_tid);
+ if (reason == EXT4_FC_REASON_FC_START_FAILED ||
+ reason == EXT4_FC_REASON_INELIGIBLE)
+ return jbd2_complete_transaction(journal, commit_tid);
+ return 0;
+}
+
+/*
+ * Fast commit cleanup routine. This is called after every fast commit and
+ * full commit. full is true if we are called after a full commit.
+ */
+static void ext4_fc_cleanup(journal_t *journal, int full)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_dentry_update *fc_dentry;
+ struct list_head *pos, *n;
+
+ if (full && sbi->s_fc_bh)
+ sbi->s_fc_bh = NULL;
+
+ jbd2_fc_release_bufs(journal);
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
+ iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_del_init(&iter->i_fc_list);
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ ext4_fc_reset_inode(&iter->vfs_inode);
+ /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
+ }
+
+ while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
+ fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
+ struct ext4_fc_dentry_update,
+ fcd_list);
+ list_del_init(&fc_dentry->fcd_list);
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+ spin_lock(&sbi->s_fc_lock);
+ }
+
+ list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
+ &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
+ &sbi->s_fc_q[FC_Q_STAGING]);
+
+ sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
+ sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
+
+ if (full)
+ sbi->s_fc_bytes = 0;
+ spin_unlock(&sbi->s_fc_lock);
+ trace_ext4_fc_stats(sb);
+}
+
+/* Ext4 Replay Path Routines */
+
+/* Get length of a particular tlv */
+static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
+{
+ return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
+{
+ return (u8 *)tl + sizeof(*tl);
+}
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+ int parent_ino, dname_len, ino, inode_len;
+ char *dname;
+};
+
+static inline void tl_to_darg(struct dentry_info_args *darg,
+ struct ext4_fc_tl *tl)
+{
+ struct ext4_fc_dentry_info *fcd;
+
+ fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
+
+ darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
+ darg->ino = le32_to_cpu(fcd->fc_ino);
+ darg->dname = fcd->fc_dname;
+ darg->dname_len = ext4_fc_tag_len(tl) -
+ sizeof(struct ext4_fc_dentry_info);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+ struct inode *inode, *old_parent;
+ struct qstr entry;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ entry.name = darg.dname;
+ entry.len = darg.dname_len;
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode %d not found", darg.ino);
+ return 0;
+ }
+
+ old_parent = ext4_iget(sb, darg.parent_ino,
+ EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(old_parent)) {
+ jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
+ iput(inode);
+ return 0;
+ }
+
+ ret = __ext4_unlink(old_parent, &entry, inode);
+ /* -ENOENT ok coz it might not exist anymore. */
+ if (ret == -ENOENT)
+ ret = 0;
+ iput(old_parent);
+ iput(inode);
+ return ret;
+}
+
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+ struct dentry_info_args *darg,
+ struct inode *inode)
+{
+ struct inode *dir = NULL;
+ struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+ struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+ int ret = 0;
+
+ dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+ dir = NULL;
+ goto out;
+ }
+
+ dentry_dir = d_obtain_alias(dir);
+ if (IS_ERR(dentry_dir)) {
+ jbd_debug(1, "Failed to obtain dentry");
+ dentry_dir = NULL;
+ goto out;
+ }
+
+ dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+ if (!dentry_inode) {
+ jbd_debug(1, "Inode dentry not created.");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __ext4_link(dir, inode, dentry_inode);
+ /*
+ * It's possible that link already existed since data blocks
+ * for the dir in question got persisted before we crashed OR
+ * we replayed this tag and crashed before the entire replay
+ * could complete.
+ */
+ if (ret && ret != -EEXIST) {
+ jbd_debug(1, "Failed to link\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dentry_dir) {
+ d_drop(dentry_dir);
+ dput(dentry_dir);
+ } else if (dir) {
+ iput(dir);
+ }
+ if (dentry_inode) {
+ d_drop(dentry_inode);
+ dput(dentry_inode);
+ }
+
+ return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+ struct inode *inode;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+ struct ext4_fc_replay_state *state;
+ int i;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++)
+ if (state->fc_modified_inodes[i] == ino)
+ return 0;
+ if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ state->fc_modified_inodes = krealloc(
+ state->fc_modified_inodes, sizeof(int) *
+ state->fc_modified_inodes_size,
+ GFP_KERNEL);
+ if (!state->fc_modified_inodes)
+ return -ENOMEM;
+ }
+ state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+ return 0;
+}
+
+/*
+ * Inode replay function
+ */
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+ struct ext4_fc_inode *fc_inode;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode *raw_fc_inode;
+ struct inode *inode = NULL;
+ struct ext4_iloc iloc;
+ int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+ struct ext4_extent_header *eh;
+
+ fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
+
+ ino = le32_to_cpu(fc_inode->fc_ino);
+ trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (!IS_ERR_OR_NULL(inode)) {
+ ext4_ext_clear_bb(inode);
+ iput(inode);
+ }
+
+ ext4_fc_record_modified_inode(sb, ino);
+
+ raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
+ ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+ if (ret)
+ goto out;
+
+ inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
+ raw_inode = ext4_raw_inode(&iloc);
+
+ memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+ memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
+ inode_len - offsetof(struct ext4_inode, i_generation));
+ if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+ eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+ if (eh->eh_magic != EXT4_EXT_MAGIC) {
+ memset(eh, 0, sizeof(*eh));
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max = cpu_to_le16(
+ (sizeof(raw_inode->i_block) -
+ sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent));
+ }
+ } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+ memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+ sizeof(raw_inode->i_block));
+ }
+
+ /* Immediately update the inode on disk. */
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ if (ret)
+ goto out;
+ ret = sync_dirty_buffer(iloc.bh);
+ if (ret)
+ goto out;
+ ret = ext4_mark_inode_used(sb, ino);
+ if (ret)
+ goto out;
+
+ /* Given that we just wrote the inode on disk, this SHOULD succeed. */
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Our allocator could have made different decisions than before
+ * crashing. This should be fixed but until then, we calculate
+ * the number of blocks the inode.
+ */
+ ext4_ext_replay_set_iblocks(inode);
+
+ inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+ ext4_reset_inode_seed(inode);
+
+ ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ sync_dirty_buffer(iloc.bh);
+ brelse(iloc.bh);
+out:
+ iput(inode);
+ if (!ret)
+ blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+
+ return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+ int ret = 0;
+ struct inode *inode = NULL;
+ struct inode *dir = NULL;
+ struct dentry_info_args darg;
+
+ tl_to_darg(&darg, tl);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ /* This takes care of update group descriptor and other metadata */
+ ret = ext4_mark_inode_used(sb, darg.ino);
+ if (ret)
+ goto out;
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "inode %d not found.", darg.ino);
+ inode = NULL;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ /*
+ * If we are creating a directory, we need to make sure that the
+ * dot and dot dot dirents are setup properly.
+ */
+ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(dir)) {
+ jbd_debug(1, "Dir %d not found.", darg.ino);
+ goto out;
+ }
+ ret = ext4_init_new_dir(NULL, dir, inode);
+ iput(dir);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ if (ret)
+ goto out;
+ set_nlink(inode, 1);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area. Our
+ * simple replay phase allocator excludes these regions from allocation.
+ */
+static int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
+{
+ struct ext4_fc_replay_state *state;
+ struct ext4_fc_alloc_region *region;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ if (state->fc_regions_used == state->fc_regions_size) {
+ state->fc_regions_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ state->fc_regions = krealloc(
+ state->fc_regions,
+ state->fc_regions_size *
+ sizeof(struct ext4_fc_alloc_region),
+ GFP_KERNEL);
+ if (!state->fc_regions)
+ return -ENOMEM;
+ }
+ region = &state->fc_regions[state->fc_regions_used++];
+ region->ino = ino;
+ region->lblk = lblk;
+ region->pblk = pblk;
+ region->len = len;
+
+ return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+ struct ext4_fc_tl *tl)
+{
+ struct ext4_fc_add_range *fc_add_ex;
+ struct ext4_extent newex, *ex;
+ struct inode *inode;
+ ext4_lblk_t start, cur;
+ int remaining, len;
+ ext4_fsblk_t start_pblk;
+ struct ext4_map_blocks map;
+ struct ext4_ext_path *path = NULL;
+ int ret;
+
+ fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+ ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+ le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
+ inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
+ EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+ start = le32_to_cpu(ex->ee_block);
+ start_pblk = ext4_ext_pblock(ex);
+ len = ext4_ext_get_actual_len(ex);
+
+ cur = start;
+ remaining = len;
+ jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ start, start_pblk, len, ext4_ext_is_unwritten(ex),
+ inode->i_ino);
+
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+ map.m_pblk = 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0) {
+ iput(inode);
+ return 0;
+ }
+
+ if (ret == 0) {
+ /* Range is not mapped */
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (!path)
+ continue;
+ memset(&newex, 0, sizeof(newex));
+ newex.ee_block = cpu_to_le32(cur);
+ ext4_ext_store_pblock(
+ &newex, start_pblk + cur - start);
+ newex.ee_len = cpu_to_le16(map.m_len);
+ if (ext4_ext_is_unwritten(ex))
+ ext4_ext_mark_unwritten(&newex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_insert_extent(
+ NULL, inode, &path, &newex, 0);
+ up_write((&EXT4_I(inode)->i_data_sem));
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret) {
+ iput(inode);
+ return 0;
+ }
+ goto next;
+ }
+
+ if (start_pblk + cur - start != map.m_pblk) {
+ /*
+ * Logical to physical mapping changed. This can happen
+ * if this range was removed and then reallocated to
+ * map to new physical blocks during a fast commit.
+ */
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex),
+ start_pblk + cur - start);
+ if (ret) {
+ iput(inode);
+ return 0;
+ }
+ /*
+ * Mark the old blocks as free since they aren't used
+ * anymore. We maintain an array of all the modified
+ * inodes. In case these blocks are still used at either
+ * a different logical range in the same inode or in
+ * some different inode, we will mark them as allocated
+ * at the end of the FC replay using our array of
+ * modified inodes.
+ */
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ goto next;
+ }
+
+ /* Range is mapped and needs a state change */
+ jbd_debug(1, "Converting from %d to %d %lld",
+ map.m_flags & EXT4_MAP_UNWRITTEN,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ if (ret) {
+ iput(inode);
+ return 0;
+ }
+ /*
+ * We may have split the extent tree while toggling the state.
+ * Try to shrink the extent tree now.
+ */
+ ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+ cur += map.m_len;
+ remaining -= map.m_len;
+ }
+ ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+ sb->s_blocksize_bits);
+ iput(inode);
+ return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+ struct inode *inode;
+ struct ext4_fc_del_range *lrange;
+ struct ext4_map_blocks map;
+ ext4_lblk_t cur, remaining;
+ int ret;
+
+ lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
+ cur = le32_to_cpu(lrange->fc_lblk);
+ remaining = le32_to_cpu(lrange->fc_len);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+ le32_to_cpu(lrange->fc_ino), cur, remaining);
+
+ inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+ jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ inode->i_ino, le32_to_cpu(lrange->fc_lblk),
+ le32_to_cpu(lrange->fc_len));
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0) {
+ iput(inode);
+ return 0;
+ }
+ if (ret > 0) {
+ remaining -= ret;
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ } else {
+ remaining -= map.m_len;
+ cur += map.m_len;
+ }
+ }
+
+ ret = ext4_punch_hole(inode,
+ le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
+ le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
+ if (ret)
+ jbd_debug(1, "ext4_punch_hole returned %d", ret);
+ ext4_ext_replay_shrink_inode(inode,
+ i_size_read(inode) >> sb->s_blocksize_bits);
+ ext4_mark_inode_dirty(NULL, inode);
+ iput(inode);
+
+ return 0;
+}
+
+static inline const char *tag2str(u16 tag)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_LINK:
+ return "TAG_ADD_ENTRY";
+ case EXT4_FC_TAG_UNLINK:
+ return "TAG_DEL_ENTRY";
+ case EXT4_FC_TAG_ADD_RANGE:
+ return "TAG_ADD_RANGE";
+ case EXT4_FC_TAG_CREAT:
+ return "TAG_CREAT_DENTRY";
+ case EXT4_FC_TAG_DEL_RANGE:
+ return "TAG_DEL_RANGE";
+ case EXT4_FC_TAG_INODE:
+ return "TAG_INODE";
+ case EXT4_FC_TAG_PAD:
+ return "TAG_PAD";
+ case EXT4_FC_TAG_TAIL:
+ return "TAG_TAIL";
+ case EXT4_FC_TAG_HEAD:
+ return "TAG_HEAD";
+ default:
+ return "TAG_ERROR";
+ }
+}
+
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+ struct ext4_fc_replay_state *state;
+ struct inode *inode;
+ struct ext4_ext_path *path = NULL;
+ struct ext4_map_blocks map;
+ int i, ret, j;
+ ext4_lblk_t cur, end;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++) {
+ inode = ext4_iget(sb, state->fc_modified_inodes[i],
+ EXT4_IGET_NORMAL);
+ if (IS_ERR_OR_NULL(inode)) {
+ jbd_debug(1, "Inode %d not found.",
+ state->fc_modified_inodes[i]);
+ continue;
+ }
+ cur = 0;
+ end = EXT_MAX_BLOCKS;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR_OR_NULL(path)) {
+ for (j = 0; j < path->p_depth; j++)
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 1);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+ map.m_len, 1);
+ } else {
+ cur = cur + (map.m_len ? map.m_len : 1);
+ }
+ }
+ iput(inode);
+ }
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase is calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+ int i;
+ struct ext4_fc_replay_state *state;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_regions_valid; i++) {
+ if (state->fc_regions[i].ino == 0 ||
+ state->fc_regions[i].len == 0)
+ continue;
+ if (blk >= state->fc_regions[i].pblk &&
+ blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+ return true;
+ }
+ return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+ kfree(sbi->s_fc_replay_state.fc_regions);
+ kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to replayed during the replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+ struct buffer_head *bh, int off,
+ tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_replay_state *state;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_add_range *ext;
+ struct ext4_fc_tl *tl;
+ struct ext4_fc_tail *tail;
+ __u8 *start, *end;
+ struct ext4_fc_head *head;
+ struct ext4_extent *ex;
+
+ state = &sbi->s_fc_replay_state;
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ if (state->fc_replay_expected_off == 0) {
+ state->fc_cur_tag = 0;
+ state->fc_replay_num_tags = 0;
+ state->fc_crc = 0;
+ state->fc_regions = NULL;
+ state->fc_regions_valid = state->fc_regions_used =
+ state->fc_regions_size = 0;
+ /* Check if we can stop early */
+ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+ != EXT4_FC_TAG_HEAD)
+ return 0;
+ }
+
+ if (off != state->fc_replay_expected_off) {
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ state->fc_replay_expected_off++;
+ fc_for_each_tl(start, end, tl) {
+ jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+ tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
+ switch (le16_to_cpu(tl->fc_tag)) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+ ex = (struct ext4_extent *)&ext->fc_ex;
+ ret = ext4_fc_record_regions(sb,
+ le32_to_cpu(ext->fc_ino),
+ le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+ ext4_ext_get_actual_len(ex));
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ fallthrough;
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+ sizeof(*tl) + ext4_fc_tag_len(tl));
+ break;
+ case EXT4_FC_TAG_TAIL:
+ state->fc_cur_tag++;
+ tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+ sizeof(*tl) +
+ offsetof(struct ext4_fc_tail,
+ fc_crc));
+ if (le32_to_cpu(tail->fc_tid) == expected_tid &&
+ le32_to_cpu(tail->fc_crc) == state->fc_crc) {
+ state->fc_replay_num_tags = state->fc_cur_tag;
+ state->fc_regions_valid =
+ state->fc_regions_used;
+ } else {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+ }
+ state->fc_crc = 0;
+ break;
+ case EXT4_FC_TAG_HEAD:
+ head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
+ if (le32_to_cpu(head->fc_features) &
+ ~EXT4_FC_SUPPORTED_FEATURES) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ if (le32_to_cpu(head->fc_tid) != expected_tid) {
+ ret = JBD2_FC_REPLAY_STOP;
+ break;
+ }
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+ sizeof(*tl) + ext4_fc_tag_len(tl));
+ break;
+ default:
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ }
+ if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+ break;
+ }
+
+out_err:
+ trace_ext4_fc_replay_scan(sb, ret, off);
+ return ret;
+}
+
+/*
+ * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
+ */
+static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
+ enum passtype pass, int off, tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl *tl;
+ __u8 *start, *end;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+ struct ext4_fc_tail *tail;
+
+ if (pass == PASS_SCAN) {
+ state->fc_current_pass = PASS_SCAN;
+ return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+ }
+
+ if (state->fc_current_pass != pass) {
+ state->fc_current_pass = pass;
+ sbi->s_mount_state |= EXT4_FC_REPLAY;
+ }
+ if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+ jbd_debug(1, "Replay stops\n");
+ ext4_fc_set_bitmaps_and_counters(sb);
+ return 0;
+ }
+
+#ifdef CONFIG_EXT4_DEBUG
+ if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+ pr_warn("Dropping fc block %d because max_replay set\n", off);
+ return JBD2_FC_REPLAY_STOP;
+ }
+#endif
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ fc_for_each_tl(start, end, tl) {
+ if (state->fc_replay_num_tags == 0) {
+ ret = JBD2_FC_REPLAY_STOP;
+ ext4_fc_set_bitmaps_and_counters(sb);
+ break;
+ }
+ jbd_debug(3, "Replay phase, tag:%s\n",
+ tag2str(le16_to_cpu(tl->fc_tag)));
+ state->fc_replay_num_tags--;
+ switch (le16_to_cpu(tl->fc_tag)) {
+ case EXT4_FC_TAG_LINK:
+ ret = ext4_fc_replay_link(sb, tl);
+ break;
+ case EXT4_FC_TAG_UNLINK:
+ ret = ext4_fc_replay_unlink(sb, tl);
+ break;
+ case EXT4_FC_TAG_ADD_RANGE:
+ ret = ext4_fc_replay_add_range(sb, tl);
+ break;
+ case EXT4_FC_TAG_CREAT:
+ ret = ext4_fc_replay_create(sb, tl);
+ break;
+ case EXT4_FC_TAG_DEL_RANGE:
+ ret = ext4_fc_replay_del_range(sb, tl);
+ break;
+ case EXT4_FC_TAG_INODE:
+ ret = ext4_fc_replay_inode(sb, tl);
+ break;
+ case EXT4_FC_TAG_PAD:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+ ext4_fc_tag_len(tl), 0);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
+ ext4_fc_tag_len(tl), 0);
+ tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+ WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
+ break;
+ case EXT4_FC_TAG_HEAD:
+ break;
+ default:
+ trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
+ ext4_fc_tag_len(tl), 0);
+ ret = -ECANCELED;
+ break;
+ }
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ }
+ return ret;
+}
+
+void ext4_fc_init(struct super_block *sb, journal_t *journal)
+{
+ /*
+ * We set replay callback even if fast commit disabled because we may
+ * could still have fast commit blocks that need to be replayed even if
+ * fast commit has now been turned off.
+ */
+ journal->j_fc_replay_callback = ext4_fc_replay;
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return;
+ journal->j_fc_cleanup_callback = ext4_fc_cleanup;
+ if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) {
+ pr_warn("Error while enabling fast commits, turning off.");
+ ext4_clear_feature_fast_commit(sb);
+ }
+}
+
+const char *fc_ineligible_reasons[] = {
+ "Extended attributes changed",
+ "Cross rename",
+ "Journal flag changed",
+ "Insufficient memory",
+ "Swap boot",
+ "Resize",
+ "Dir renamed",
+ "Falloc range op",
+ "FC Commit Failed"
+};
+
+int ext4_fc_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
+ struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+ int i;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ seq_printf(seq,
+ "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
+ stats->fc_num_commits, stats->fc_ineligible_commits,
+ stats->fc_numblks,
+ div_u64(sbi->s_fc_avg_commit_time, 1000));
+ seq_puts(seq, "Ineligible reasons:\n");
+ for (i = 0; i < EXT4_FC_REASON_MAX; i++)
+ seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
+ stats->fc_ineligible_reason_count[i]);
+
+ return 0;
+}
+
+int __init ext4_fc_init_dentry_cache(void)
+{
+ ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
+ SLAB_RECLAIM_ACCOUNT);
+
+ if (ext4_fc_dentry_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
new file mode 100644
index 000000000000..06907d485989
--- /dev/null
+++ b/fs/ext4/fast_commit.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FAST_COMMIT_H__
+#define __FAST_COMMIT_H__
+
+/* Number of blocks in journal area to allocate for fast commits */
+#define EXT4_NUM_FC_BLKS 256
+
+/* Fast commit tags */
+#define EXT4_FC_TAG_ADD_RANGE 0x0001
+#define EXT4_FC_TAG_DEL_RANGE 0x0002
+#define EXT4_FC_TAG_CREAT 0x0003
+#define EXT4_FC_TAG_LINK 0x0004
+#define EXT4_FC_TAG_UNLINK 0x0005
+#define EXT4_FC_TAG_INODE 0x0006
+#define EXT4_FC_TAG_PAD 0x0007
+#define EXT4_FC_TAG_TAIL 0x0008
+#define EXT4_FC_TAG_HEAD 0x0009
+
+#define EXT4_FC_SUPPORTED_FEATURES 0x0
+
+/* On disk fast commit tlv value structures */
+
+/* Fast commit on disk tag length structure */
+struct ext4_fc_tl {
+ __le16 fc_tag;
+ __le16 fc_len;
+};
+
+/* Value structure for tag EXT4_FC_TAG_HEAD. */
+struct ext4_fc_head {
+ __le32 fc_features;
+ __le32 fc_tid;
+};
+
+/* Value structure for EXT4_FC_TAG_ADD_RANGE. */
+struct ext4_fc_add_range {
+ __le32 fc_ino;
+ __u8 fc_ex[12];
+};
+
+/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */
+struct ext4_fc_del_range {
+ __le32 fc_ino;
+ __le32 fc_lblk;
+ __le32 fc_len;
+};
+
+/*
+ * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK
+ * and EXT4_FC_TAG_UNLINK.
+ */
+struct ext4_fc_dentry_info {
+ __le32 fc_parent_ino;
+ __le32 fc_ino;
+ u8 fc_dname[0];
+};
+
+/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+struct ext4_fc_inode {
+ __le32 fc_ino;
+ __u8 fc_raw_inode[0];
+};
+
+/* Value structure for tag EXT4_FC_TAG_TAIL. */
+struct ext4_fc_tail {
+ __le32 fc_tid;
+ __le32 fc_crc;
+};
+
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+ int fcd_op; /* Type of update create / unlink / link */
+ int fcd_parent; /* Parent inode number */
+ int fcd_ino; /* Inode number */
+ struct qstr fcd_name; /* Dirent name */
+ unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
+ struct list_head fcd_list;
+};
+
+/*
+ * Fast commit reason codes
+ */
+enum {
+ /*
+ * Commit status codes:
+ */
+ EXT4_FC_REASON_OK = 0,
+ EXT4_FC_REASON_INELIGIBLE,
+ EXT4_FC_REASON_ALREADY_COMMITTED,
+ EXT4_FC_REASON_FC_START_FAILED,
+ EXT4_FC_REASON_FC_FAILED,
+
+ /*
+ * Fast commit ineligiblity reasons:
+ */
+ EXT4_FC_REASON_XATTR = 0,
+ EXT4_FC_REASON_CROSS_RENAME,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
+ EXT4_FC_REASON_MEM,
+ EXT4_FC_REASON_SWAP_BOOT,
+ EXT4_FC_REASON_RESIZE,
+ EXT4_FC_REASON_RENAME_DIR,
+ EXT4_FC_REASON_FALLOC_RANGE,
+ EXT4_FC_COMMIT_FAILED,
+ EXT4_FC_REASON_MAX
+};
+
+struct ext4_fc_stats {
+ unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
+ unsigned long fc_num_commits;
+ unsigned long fc_ineligible_commits;
+ unsigned long fc_numblks;
+};
+
+#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
+
+/*
+ * Physical block regions added to different inodes due to fast commit
+ * recovery. These are set during the SCAN phase. During the replay phase,
+ * our allocator excludes these from its allocation. This ensures that
+ * we don't accidentally allocating a block that is going to be used by
+ * another inode.
+ */
+struct ext4_fc_alloc_region {
+ ext4_lblk_t lblk;
+ ext4_fsblk_t pblk;
+ int ino, len;
+};
+
+/*
+ * Fast commit replay state.
+ */
+struct ext4_fc_replay_state {
+ int fc_replay_num_tags;
+ int fc_replay_expected_off;
+ int fc_current_pass;
+ int fc_cur_tag;
+ int fc_crc;
+ struct ext4_fc_alloc_region *fc_regions;
+ int fc_regions_size, fc_regions_used, fc_regions_valid;
+ int *fc_modified_inodes;
+ int fc_modified_inodes_used, fc_modified_inodes_size;
+};
+
+#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+
+#define fc_for_each_tl(__start, __end, __tl) \
+ for (tl = (struct ext4_fc_tl *)start; \
+ (u8 *)tl < (u8 *)end; \
+ tl = (struct ext4_fc_tl *)((u8 *)tl + \
+ sizeof(struct ext4_fc_tl) + \
+ + le16_to_cpu(tl->fc_len)))
+
+
+#endif /* __FAST_COMMIT_H__ */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7d61069531d3..d85412d12e3a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -260,6 +260,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
+ ext4_fc_start_update(inode);
inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
@@ -271,6 +272,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
out:
inode_unlock(inode);
+ ext4_fc_stop_update(inode);
if (likely(ret > 0)) {
iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
@@ -534,7 +536,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ ext4_fc_start_update(inode);
ret = ext4_orphan_add(handle, inode);
+ ext4_fc_stop_update(inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
@@ -656,8 +660,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
#endif
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
-
- return ext4_buffered_write_iter(iocb, from);
+ else
+ return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
@@ -757,6 +761,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
+ ext4_fc_start_update(inode);
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -764,6 +769,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
+ ext4_fc_stop_update(inode);
return 0;
}
@@ -844,7 +850,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index dbccf46f1770..b232c2767534 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -108,6 +108,9 @@ static int ext4_getfsmap_helper(struct super_block *sb,
/* Are we just counting mappings? */
if (info->gfi_head->fmh_count == 0) {
+ if (info->gfi_head->fmh_entries == UINT_MAX)
+ return EXT4_QUERY_RANGE_ABORT;
+
if (rec_fsblk > info->gfi_next_fsblk)
info->gfi_head->fmh_entries++;
@@ -571,8 +574,8 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev &&
+ fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
return true;
return false;
}
@@ -642,9 +645,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1d668c8f131f..81a545fd14a3 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -112,7 +112,7 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync,
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
*needs_barrier = true;
- return jbd2_complete_transaction(journal, commit_tid);
+ return ext4_fc_commit(journal, commit_tid);
}
/*
@@ -150,7 +150,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = file_write_and_wait_range(file, start, end);
if (ret)
- return ret;
+ goto out;
/*
* data=writeback,ordered:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 698ca4a4db5f..b215c564bc31 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -189,10 +194,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
@@ -284,15 +286,17 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
- grp = ext4_get_group_info(sb, block_group);
if (IS_ERR(bitmap_bh)) {
fatal = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto error_return;
}
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
- fatal = -EFSCORRUPTED;
- goto error_return;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
+ goto error_return;
+ }
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -742,6 +746,122 @@ not_found:
return 1;
}
+int ext4_mark_inode_used(struct super_block *sb, int ino)
+{
+ unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+ struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
+ struct ext4_group_desc *gdp;
+ ext4_group_t group;
+ int bit;
+ int err = -EFSCORRUPTED;
+
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto out;
+
+ group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (IS_ERR(inode_bitmap_bh))
+ return PTR_ERR(inode_bitmap_bh);
+
+ if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
+ err = 0;
+ goto out;
+ }
+
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp || !group_desc_bh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
+
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ err = sync_dirty_buffer(inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ /* We may have to initialize the block bitmap if it isn't already */
+ if (ext4_has_group_desc_csum(sb) &&
+ gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ struct buffer_head *block_bitmap_bh;
+
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
+ goto out;
+ }
+
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
+ sync_dirty_buffer(block_bitmap_bh);
+
+ /* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
+
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ }
+
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - bit - 1));
+ } else {
+ ext4_lock_group(sb, group);
+ }
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+
+ ext4_unlock_group(sb, group);
+ err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
+ sync_dirty_buffer(group_desc_bh);
+out:
+ return err;
+}
+
static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
bool encrypt)
{
@@ -818,7 +938,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
- struct ext4_group_info *grp;
+ struct ext4_group_info *grp = NULL;
bool encrypt = false;
/* Cannot create files in a deleted directory */
@@ -918,15 +1038,21 @@ got_group:
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
- grp = ext4_get_group_info(sb, group);
- /* Skip groups with already-known suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
- goto next_group;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ /*
+ * Skip groups with already-known suspicious inode
+ * tables
+ */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
+ }
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
@@ -945,7 +1071,7 @@ repeat_in_this_group:
goto next_group;
}
- if (!handle) {
+ if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
handle_type, nblocks, 0,
@@ -1049,9 +1175,15 @@ got:
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-
- down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ struct ext4_group_info *grp = NULL;
+
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ down_read(&grp->alloc_sem); /*
+ * protect vs itable
+ * lazyinit
+ */
+ }
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
@@ -1067,7 +1199,8 @@ got:
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
- up_read(&grp->alloc_sem);
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 80c9f33800be..05efa682bc2f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -163,7 +163,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
}
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
put_bh(bh);
goto failure;
}
@@ -593,7 +593,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto out;
}
/* Set up for the direct block allocation */
@@ -1012,14 +1013,14 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
/* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ bh = ext4_sb_bread(inode->i_sb, nr, 0);
/*
* A read failure? Report error and clear slot
* (should be rare).
*/
- if (!bh) {
- ext4_error_inode_block(inode, nr, EIO,
+ if (IS_ERR(bh)) {
+ ext4_error_inode_block(inode, nr, -PTR_ERR(bh),
"Read failure");
continue;
}
@@ -1033,7 +1034,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
brelse(bh);
/*
- * Everything below this this pointer has been
+ * Everything below this pointer has been
* released. Now let this top-of-subtree go.
*
* We want the freeing of this indirect block to be
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 75c97bca0815..caa51473207d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -354,7 +354,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- /* Update the xttr entry. */
+ /* Update the xattr entry. */
i.value = value;
i.value_len = len;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf596467c234..03c2253005f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -101,8 +101,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
return provided == calculated;
}
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
- struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
__u32 csum;
@@ -514,7 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -729,6 +730,8 @@ out_sem:
if (ret)
return ret;
}
+ ext4_fc_track_range(inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1);
}
if (retval < 0)
@@ -825,7 +828,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
- J_ASSERT(handle != NULL || create == 0);
+ J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
@@ -841,7 +845,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || (handle != NULL));
/*
* Now that we do not always journal data, we should
@@ -878,18 +883,20 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
+ int ret;
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
}
/* Read a contiguous batch of blocks. */
@@ -910,8 +917,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
- &bhs[i]);
+ ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
if (!wait)
return 0;
@@ -1081,7 +1087,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ext4_read_bh_lock(bh, 0, false);
wait[nr_wait++] = bh;
}
}
@@ -1912,6 +1918,9 @@ static int __ext4_journalled_writepage(struct page *page,
}
if (ret == 0)
ret = err;
+ err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
+ if (ret == 0)
+ ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
@@ -2254,7 +2263,7 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
err = PTR_ERR(io_end_vec);
goto out;
}
- io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
}
*map_bh = true;
goto out;
@@ -2785,7 +2794,7 @@ retry:
* ext4_journal_stop() can wait for transaction commit
* to finish which may depend on writeback of pages to
* complete or on page lock to be released. In that
- * case, we have to wait until after after we have
+ * case, we have to wait until after we have
* submitted all the IO, released page locks we hold,
* and dropped io_end reference (for extent conversion
* to be able to complete) before stopping the handle.
@@ -3296,9 +3305,14 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (journal)
- return !jbd2_transaction_committed(journal,
- EXT4_I(inode)->i_datasync_tid);
+ if (journal) {
+ if (jbd2_transaction_committed(journal,
+ EXT4_I(inode)->i_datasync_tid))
+ return true;
+ return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >=
+ EXT4_I(inode)->i_fc_committed_subtid;
+ }
+
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
@@ -3436,14 +3450,26 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- if (flags & IOMAP_WRITE)
+ if (flags & IOMAP_WRITE) {
+ /*
+ * We check here if the blocks are already allocated, then we
+ * don't need to start a journal txn and we can directly return
+ * the mapping information. This could boost performance
+ * especially in multi-threaded overwrite requests.
+ */
+ if (offset + length <= i_size_read(inode)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+ goto out;
+ }
ret = ext4_iomap_alloc(inode, &map, flags);
- else
+ } else {
ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
-
+out:
ext4_set_iomap(inode, iomap, &map, offset, length);
return 0;
@@ -3601,6 +3627,13 @@ static int ext4_set_page_dirty(struct page *page)
return __set_page_dirty_buffers(page);
}
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+ struct file *file, sector_t *span)
+{
+ return iomap_swapfile_activate(sis, file, span,
+ &ext4_iomap_report_ops);
+}
+
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readahead = ext4_readahead,
@@ -3616,6 +3649,7 @@ static const struct address_space_operations ext4_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_journalled_aops = {
@@ -3632,6 +3666,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_da_aops = {
@@ -3649,6 +3684,7 @@ static const struct address_space_operations ext4_da_aops = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_dax_aops = {
@@ -3657,6 +3693,7 @@ static const struct address_space_operations ext4_dax_aops = {
.set_page_dirty = noop_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = noop_invalidatepage,
+ .swap_activate = ext4_iomap_swap_activate,
};
void ext4_set_aops(struct inode *inode)
@@ -3730,11 +3767,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ err = ext4_read_bh_lock(bh, 0, true);
+ if (err)
goto unlock;
if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
@@ -4073,6 +4107,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
}
+ ext4_fc_track_range(inode, first_block, stop_block);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4252,22 +4287,22 @@ out_trace:
* data in memory that is needed to recreate the on-disk version of this
* inode.
*/
-static int __ext4_get_inode_loc(struct inode *inode,
- struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc, int in_mem,
+ ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
struct buffer_head *bh;
- struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
- if (inode->i_ino < EXT4_ROOT_INO ||
- inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+ if (ino < EXT4_ROOT_INO ||
+ ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
return -EFSCORRUPTED;
- iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
if (!gdp)
return -EIO;
@@ -4276,7 +4311,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
- inode_offset = ((inode->i_ino - 1) %
+ inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
@@ -4289,16 +4324,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
- /*
- * If the buffer has the write error flag, we have failed
- * to write out another inode in the same block. In this
- * case, we don't have to read the block because we may
- * read the old inode data successfully.
- */
- if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
- set_buffer_uptodate(bh);
-
- if (buffer_uptodate(bh)) {
+ if (ext4_buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
goto has_buffer;
@@ -4369,7 +4395,7 @@ make_io:
if (end > table)
end = table;
while (b <= end)
- sb_breadahead_unmovable(sb, b++);
+ ext4_sb_breadahead_unmovable(sb, b++);
}
/*
@@ -4377,16 +4403,14 @@ make_io:
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
- trace_ext4_load_inode(inode);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_load_inode(sb, ino);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
simulate_eio:
- ext4_error_inode_block(inode, block, EIO,
- "unable to read itable block");
+ if (ret_block)
+ *ret_block = block;
brelse(bh);
return -EIO;
}
@@ -4396,11 +4420,43 @@ has_buffer:
return 0;
}
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ ext4_fsblk_t err_blk;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
+ ext4_fsblk_t err_blk;
+ int ret;
+
/* We have all inode data except xattrs in memory here. */
- return __ext4_get_inode_loc(inode, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc)
+{
+ return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
}
static bool ext4_should_enable_dax(struct inode *inode)
@@ -4566,7 +4622,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei = EXT4_I(inode);
iloc.bh = NULL;
- ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
@@ -4612,10 +4668,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
- ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
- ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
- "iget: checksum invalid");
+ if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+ (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+ ext4_error_inode_err(inode, function, line, 0,
+ EFSBADCRC, "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4703,6 +4760,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
for (block = 0; block < EXT4_N_BLOCKS; block++)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ ext4_fc_init_inode(&ei->vfs_inode);
/*
* Set transaction id's of transactions that have to be committed
@@ -4768,9 +4826,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
/* validate the block references in the inode */
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
+ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode)))) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_check_inode(inode);
else
@@ -4971,6 +5030,12 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+ err = ext4_inode_blocks_set(handle, raw_inode, ei);
+ if (err) {
+ spin_unlock(&ei->i_raw_lock);
+ goto out_brelse;
+ }
+
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
i_gid = i_gid_read(inode);
@@ -5004,11 +5069,6 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- err = ext4_inode_blocks_set(handle, raw_inode, ei);
- if (err) {
- spin_unlock(&ei->i_raw_lock);
- goto out_brelse;
- }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
@@ -5149,12 +5209,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
- err = __ext4_get_inode_loc(inode, &iloc, 0);
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (err)
return err;
/*
@@ -5278,6 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
}
+ ext4_fc_start_update(inode);
if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
(ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
@@ -5301,6 +5362,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error) {
ext4_journal_stop(handle);
+ ext4_fc_stop_update(inode);
return error;
}
/* Update corresponding info in inode so that everything is in
@@ -5323,11 +5385,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+ ext4_fc_stop_update(inode);
return -EFBIG;
+ }
}
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode)) {
+ ext4_fc_stop_update(inode);
return -EINVAL;
+ }
if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
inode_inc_iversion(inode);
@@ -5351,7 +5417,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
rc = ext4_break_layouts(inode);
if (rc) {
up_write(&EXT4_I(inode)->i_mmap_sem);
- return rc;
+ goto err_out;
}
if (attr->ia_size != inode->i_size) {
@@ -5372,6 +5438,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
inode->i_mtime = current_time(inode);
inode->i_ctime = inode->i_mtime;
}
+
+ if (shrink)
+ ext4_fc_track_range(inode,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits,
+ (oldsize > 0 ? oldsize - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits);
+ else
+ ext4_fc_track_range(
+ inode,
+ (oldsize > 0 ? oldsize - 1 : oldsize) >>
+ inode->i_sb->s_blocksize_bits,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits);
+
down_write(&EXT4_I(inode)->i_data_sem);
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
@@ -5430,9 +5511,11 @@ out_mmap_sem:
rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
- ext4_std_error(inode->i_sb, error);
+ if (error)
+ ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
+ ext4_fc_stop_update(inode);
return error;
}
@@ -5614,6 +5697,8 @@ int ext4_mark_iloc_dirty(handle_t *handle,
put_bh(iloc->bh);
return -EIO;
}
+ ext4_fc_track_inode(inode);
+
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
@@ -5937,6 +6022,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
@@ -5977,9 +6064,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
if (err)
goto out_ret;
+ /*
+ * On data journalling we skip straight to the transaction handle:
+ * there's no delalloc; page truncated will be checked later; the
+ * early return w/ all buffers mapped (calculates size/len) can't
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
+ */
+ if (ext4_should_journal_data(inode))
+ goto retry_alloc;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
- !ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
err = block_page_mkwrite(vma, vmf,
@@ -6005,6 +6100,9 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
+ *
+ * This cannot be done for data journalling, as we have to add the
+ * inode to the transaction's list to writeprotect pages on commit.
*/
if (page_has_buffers(page)) {
if (!ext4_walk_page_buffers(NULL, page_buffers(page),
@@ -6029,16 +6127,42 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- err = block_page_mkwrite(vma, vmf, get_block);
- if (!err && ext4_should_journal_data(inode)) {
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
- unlock_page(page);
+ /*
+ * Data journalling can't use block_page_mkwrite() because it
+ * will set_buffer_dirty() before do_journal_get_write_access()
+ * thus might hit warning messages for dirty metadata buffers.
+ */
+ if (!ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ } else {
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ ret = VM_FAULT_NOPAGE;
+ goto out_error;
+ }
+
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
+ else
+ len = PAGE_SIZE;
+
+ err = __block_write_begin(page, 0, len, ext4_get_block);
+ if (!err) {
ret = VM_FAULT_SIGBUS;
- ext4_journal_stop(handle);
- goto out;
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
+ 0, len, NULL, do_journal_get_write_access))
+ goto out_error;
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
+ 0, len, NULL, write_end_fn))
+ goto out_error;
+ if (ext4_jbd2_inode_add_write(handle, inode, 0, len))
+ goto out_error;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ } else {
+ unlock_page(page);
}
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -6049,6 +6173,10 @@ out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
+out_error:
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ goto out;
}
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 36eca3bc036a..f0381876a7e5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -86,7 +86,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
-static void reset_inode_seed(struct inode *inode)
+void ext4_reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -165,6 +165,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = -EINVAL;
goto err_out;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
@@ -199,8 +200,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
- reset_inode_seed(inode);
- reset_inode_seed(inode_bl);
+ ext4_reset_inode_seed(inode);
+ ext4_reset_inode_seed(inode_bl);
ext4_discard_preallocations(inode, 0);
@@ -247,6 +248,7 @@ revert:
err_out1:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
@@ -807,7 +809,7 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
return error;
}
-long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
@@ -1074,6 +1076,7 @@ mext_out:
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
@@ -1308,6 +1311,17 @@ out:
}
}
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ long ret;
+
+ ext4_fc_start_update(file_inode(filp));
+ ret = __ext4_ioctl(filp, cmd, arg);
+ ext4_fc_stop_update(file_inode(filp));
+
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 132c118d12e1..85abbfb98cbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -124,7 +124,7 @@
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
* The regular allocator (using the buddy cache) supports a few tunables.
@@ -619,11 +619,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy;
void *buddy2;
- {
- static int mb_check_counter;
- if (mb_check_counter++ % 100 != 0)
- return 0;
- }
+ if (e4b->bd_info->bb_check_counter++ % 10)
+ return 0;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -1394,9 +1391,6 @@ void ext4_set_bits(void *bm, int cur, int len)
}
}
-/*
- * _________________________________________________________________ */
-
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
@@ -1508,14 +1502,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ ext4_mark_group_bitmap_corrupted(
+ sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ }
mb_regenerate_buddy(e4b);
goto done;
}
@@ -2019,7 +2015,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
- * we have free blocks
+ * have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -3303,6 +3299,84 @@ out_err:
}
/*
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
+ * blocks in bitmaps and update counters.
+ */
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int i, clen, err;
+ int already;
+
+ clen = EXT4_B2C(sbi, len);
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
+ already++;
+
+ if (state)
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen + already;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen - already;
+
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ ext4_unlock_group(sb, group);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+
+ atomic64_sub(len,
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+
+out_err:
+ brelse(bitmap_bh);
+}
+
+/*
* here we normalize request for locality group
* Group request are normalized to s_mb_group_prealloc, which goes to
* s_strip if we set the same via mount option.
@@ -4160,7 +4234,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_buddy e4b;
int err;
int busy = 0;
- int free = 0;
+ int free, free_total = 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
@@ -4188,8 +4262,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
INIT_LIST_HEAD(&list);
repeat:
+ free = 0;
ext4_lock_group(sb, group);
- this_cpu_inc(discard_pa_seq);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
@@ -4206,6 +4280,9 @@ repeat:
/* seems this one can be freed ... */
ext4_mb_mark_pa_deleted(sb, pa);
+ if (!free)
+ this_cpu_inc(discard_pa_seq);
+
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -4215,22 +4292,6 @@ repeat:
list_add(&pa->u.pa_tmp_list, &list);
}
- /* if we still need more blocks and some PAs were used, try again */
- if (free < needed && busy) {
- busy = 0;
- ext4_unlock_group(sb, group);
- cond_resched();
- goto repeat;
- }
-
- /* found anything to free? */
- if (list_empty(&list)) {
- BUG_ON(free != 0);
- mb_debug(sb, "Someone else may have freed PA for this group %u\n",
- group);
- goto out;
- }
-
/* now free all selected PAs */
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
@@ -4248,14 +4309,22 @@ repeat:
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
-out:
+ free_total += free;
+
+ /* if we still need more blocks and some PAs were used, try again */
+ if (free_total < needed && busy) {
+ ext4_unlock_group(sb, group);
+ cond_resched();
+ busy = 0;
+ goto repeat;
+ }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
out_dbg:
mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
- free, group, grp->bb_free);
- return free;
+ free_total, group, grp->bb_free);
+ return free_total;
}
/*
@@ -4283,6 +4352,9 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
return;
}
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode,
@@ -4830,6 +4902,9 @@ out_dbg:
return ret;
}
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp);
+
/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
@@ -4851,6 +4926,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -5078,6 +5155,102 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
return 0;
}
+/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int i;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (; group < ext4_get_groups_count(sb); group++) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ ext4_get_group_no_and_offset(sb,
+ max(ext4_group_first_block_no(sb, group), goal),
+ NULL, &blkoff);
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize,
+ blkoff);
+ brelse(bitmap_bh);
+ if (i >= sb->s_blocksize)
+ continue;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i))
+ continue;
+ break;
+ }
+
+ if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize)
+ return 0;
+
+ block = ext4_group_first_block_no(sb, group) + i;
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
+
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int already_freed = 0, err, i;
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return;
+ }
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ return;
+
+ for (i = 0; i < count; i++) {
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
+ already_freed++;
+ }
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ return;
+ ext4_free_group_clusters_set(
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
+ count - already_freed);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ brelse(bitmap_bh);
+}
+
/**
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
@@ -5104,6 +5277,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
might_sleep();
if (bh) {
if (block)
@@ -5112,7 +5292,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block = bh->b_blocknr;
}
- sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d34cb8c46655..795c3ff2907c 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -85,15 +85,11 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
}
- get_bh(*bh);
lock_buffer(*bh);
- (*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
- ret = -EIO;
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ if (ret)
goto warn_exit;
- }
+
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 0d601b822875..64a579734f93 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -215,7 +215,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
for (i = 0; i < nr; i++) {
bh = arr[i];
if (!bh_uptodate_or_lock(bh)) {
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err)
return err;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0d74615fcce3..5159830dacb8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2553,7 +2553,7 @@ out:
* for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
* on regular files) and to avoid creating huge/slow non-HTREE directories.
*/
-static void ext4_inc_count(handle_t *handle, struct inode *inode)
+static void ext4_inc_count(struct inode *inode)
{
inc_nlink(inode);
if (is_dx(inode) &&
@@ -2565,7 +2565,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
* If a directory had nlink == 1, then we should let it be 1. This indicates
* directory has >EXT4_LINK_MAX subdirs.
*/
-static void ext4_dec_count(handle_t *handle, struct inode *inode)
+static void ext4_dec_count(struct inode *inode)
{
if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
drop_nlink(inode);
@@ -2610,7 +2610,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
handle_t *handle;
- struct inode *inode;
+ struct inode *inode, *inode_save;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@@ -2628,7 +2628,11 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
+ inode_save = inode;
+ ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
+ ext4_fc_track_create(inode_save, dentry);
+ iput(inode_save);
}
if (handle)
ext4_journal_stop(handle);
@@ -2643,7 +2647,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
handle_t *handle;
- struct inode *inode;
+ struct inode *inode, *inode_save;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@@ -2660,7 +2664,12 @@ retry:
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
+ inode_save = inode;
+ ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(inode_save, dentry);
+ iput(inode_save);
}
if (handle)
ext4_journal_stop(handle);
@@ -2739,7 +2748,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
return ext4_next_entry(de, blocksize);
}
-static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
@@ -2824,7 +2833,9 @@ out_clear_inode:
iput(inode);
goto out_retry;
}
- ext4_inc_count(handle, dir);
+ ext4_fc_track_create(inode, dentry);
+ ext4_inc_count(dir);
+
ext4_update_dx_flag(dir);
err = ext4_mark_inode_dirty(handle, dir);
if (err)
@@ -3162,8 +3173,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
goto end_rmdir;
- ext4_dec_count(handle, dir);
+ ext4_dec_count(dir);
ext4_update_dx_flag(dir);
+ ext4_fc_track_unlink(inode, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
#ifdef CONFIG_UNICODE
@@ -3184,42 +3196,32 @@ end_rmdir:
return retval;
}
-static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode)
{
- int retval;
- struct inode *inode;
+ int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
+ int skip_remove_dentry = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
-
- trace_ext4_unlink_enter(dir, dentry);
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- retval = dquot_initialize(dir);
- if (retval)
- goto out_trace;
- retval = dquot_initialize(d_inode(dentry));
- if (retval)
- goto out_trace;
-
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
- if (IS_ERR(bh)) {
- retval = PTR_ERR(bh);
- goto out_trace;
- }
- if (!bh) {
- retval = -ENOENT;
- goto out_trace;
- }
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
- inode = d_inode(dentry);
+ if (!bh)
+ return -ENOENT;
if (le32_to_cpu(de->inode) != inode->i_ino) {
- retval = -EFSCORRUPTED;
- goto out_bh;
+ /*
+ * It's okay if we find dont find dentry which matches
+ * the inode. That's because it might have gotten
+ * renamed to a different inode number
+ */
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ skip_remove_dentry = 1;
+ else
+ goto out_bh;
}
handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -3232,17 +3234,21 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
- if (retval)
- goto out_handle;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- ext4_update_dx_flag(dir);
- retval = ext4_mark_inode_dirty(handle, dir);
- if (retval)
- goto out_handle;
+ if (!skip_remove_dentry) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto out_handle;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ ext4_update_dx_flag(dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (retval)
+ goto out_handle;
+ } else {
+ retval = 0;
+ }
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
+ d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
@@ -3250,6 +3256,35 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
+ brelse(bh);
+ return retval;
+}
+
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int retval;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ return -EIO;
+
+ trace_ext4_unlink_enter(dir, dentry);
+ /*
+ * Initialize quotas before so that eventual writes go
+ * in separate transaction
+ */
+ retval = dquot_initialize(dir);
+ if (retval)
+ goto out_trace;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ goto out_trace;
+
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry));
+ if (!retval)
+ ext4_fc_track_unlink(d_inode(dentry), dentry);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3261,10 +3296,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
d_invalidate(dentry);
#endif
-out_handle:
- ext4_journal_stop(handle);
-out_bh:
- brelse(bh);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
@@ -3345,7 +3376,8 @@ static int ext4_symlink(struct inode *dir,
*/
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
handle = NULL;
if (err)
goto err_drop_inode;
@@ -3399,29 +3431,10 @@ out_free_encrypted_link:
return err;
}
-static int ext4_link(struct dentry *old_dentry,
- struct inode *dir, struct dentry *dentry)
+int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
-
- if (inode->i_nlink >= EXT4_LINK_MAX)
- return -EMLINK;
-
- err = fscrypt_prepare_link(old_dentry, dir, dentry);
- if (err)
- return err;
-
- if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
- (!projid_eq(EXT4_I(dir)->i_projid,
- EXT4_I(old_dentry->d_inode)->i_projid)))
- return -EXDEV;
-
- err = dquot_initialize(dir);
- if (err)
- return err;
-
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
@@ -3433,11 +3446,12 @@ retry:
ext4_handle_sync(handle);
inode->i_ctime = current_time(inode);
- ext4_inc_count(handle, inode);
+ ext4_inc_count(inode);
ihold(inode);
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
+ ext4_fc_track_link(inode, dentry);
err = ext4_mark_inode_dirty(handle, inode);
/* this can happen only for tmpfile being
* linked the first time
@@ -3455,6 +3469,29 @@ retry:
return err;
}
+static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int err;
+
+ if (inode->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
+ if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+ return __ext4_link(dir, inode, dentry);
+}
/*
* Try to find buffer head where contains the parent block.
@@ -3630,9 +3667,9 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
{
if (ent->dir_nlink_delta) {
if (ent->dir_nlink_delta == -1)
- ext4_dec_count(handle, ent->dir);
+ ext4_dec_count(ent->dir);
else
- ext4_inc_count(handle, ent->dir);
+ ext4_inc_count(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
}
}
@@ -3844,7 +3881,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new.inode) {
- ext4_dec_count(handle, new.inode);
+ ext4_dec_count(new.inode);
new.inode->i_ctime = current_time(new.inode);
}
old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
@@ -3854,14 +3891,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
- ext4_dec_count(handle, old.dir);
+ ext4_dec_count(old.dir);
if (new.inode) {
/* checked ext4_empty_dir above, can't have another
* parent, ext4_dec_count() won't work for many-linked
* dirs */
clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new.dir);
+ ext4_inc_count(new.dir);
ext4_update_dx_flag(new.dir);
retval = ext4_mark_inode_dirty(handle, new.dir);
if (unlikely(retval))
@@ -3871,6 +3908,22 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = ext4_mark_inode_dirty(handle, old.dir);
if (unlikely(retval))
goto end_rename;
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ /*
+ * We disable fast commits here that's because the
+ * replay code is not yet capable of changing dot dot
+ * dirents in directories.
+ */
+ ext4_fc_mark_ineligible(old.inode->i_sb,
+ EXT4_FC_REASON_RENAME_DIR);
+ } else {
+ if (new.inode)
+ ext4_fc_track_unlink(new.inode, new.dentry);
+ ext4_fc_track_link(old.inode, new.dentry);
+ ext4_fc_track_unlink(old.inode, old.dentry);
+ }
+
if (new.inode) {
retval = ext4_mark_inode_dirty(handle, new.inode);
if (unlikely(retval))
@@ -4014,7 +4067,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = ext4_mark_inode_dirty(handle, new.inode);
if (unlikely(retval))
goto end_rename;
-
+ ext4_fc_mark_ineligible(new.inode->i_sb,
+ EXT4_FC_REASON_CROSS_RENAME);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a50b51270ea9..928700d57eb6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -843,8 +843,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
BUFFER_TRACE(dind, "get_write_access");
err = ext4_journal_get_write_access(handle, dind);
- if (unlikely(err))
+ if (unlikely(err)) {
ext4_std_error(sb, err);
+ goto errout;
+ }
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -1243,7 +1245,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
brelse(bh);
return NULL;
}
@@ -1806,8 +1808,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
@@ -1932,8 +1934,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
int meta_bg;
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8b2736283481..2fe141ff3c7e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -141,27 +141,115 @@ MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+
+static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io)
+{
+ /*
+ * buffer's verified bit is no longer valid after reading from
+ * disk again due to write out error, clear it to make sure we
+ * recheck the buffer contents.
+ */
+ clear_buffer_verified(bh);
+
+ bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ, op_flags, bh);
+}
+
+void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return;
+ }
+ __ext4_read_bh(bh, op_flags, end_io);
+}
+
+int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return 0;
+ }
+
+ __ext4_read_bh(bh, op_flags, end_io);
+
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+}
+
+int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
+{
+ if (trylock_buffer(bh)) {
+ if (wait)
+ return ext4_read_bh(bh, op_flags, NULL);
+ ext4_read_bh_nowait(bh, op_flags, NULL);
+ return 0;
+ }
+ if (wait) {
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+ }
+ return 0;
+}
+
/*
- * This works like sb_bread() except it uses ERR_PTR for error
+ * This works like __bread_gfp() except it uses ERR_PTR for error
* returns. Currently with sb_bread it's impossible to distinguish
* between ENOMEM and EIO situations (since both result in a NULL
* return.
*/
-struct buffer_head *
-ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
+ sector_t block, int op_flags,
+ gfp_t gfp)
{
- struct buffer_head *bh = sb_getblk(sb, block);
+ struct buffer_head *bh;
+ int ret;
+ bh = sb_getblk_gfp(sb, block, gfp);
if (bh == NULL)
return ERR_PTR(-ENOMEM);
if (ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
+}
+
+struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
+ int op_flags)
+{
+ return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+}
+
+struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block)
+{
+ return __ext4_sb_bread_gfp(sb, block, 0, 0);
+}
+
+void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
+{
+ struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+
+ if (likely(bh)) {
+ ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ brelse(bh);
+ }
}
static int ext4_verify_csum_type(struct super_block *sb,
@@ -201,7 +289,18 @@ void ext4_superblock_csum_set(struct super_block *sb)
if (!ext4_has_metadata_csum(sb))
return;
+ /*
+ * Locking the superblock prevents the scenario
+ * where:
+ * 1) a first thread pauses during checksum calculation.
+ * 2) a second thread updates the superblock, recalculates
+ * the checksum, and updates s_checksum
+ * 3) the first thread resumes and finishes its checksum calculation
+ * and updates s_checksum with a potentially stale or torn value.
+ */
+ lock_buffer(EXT4_SB(sb)->s_sbh);
es->s_checksum = ext4_superblock_csum(sb, es);
+ unlock_buffer(EXT4_SB(sb)->s_sbh);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -472,6 +571,89 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
spin_unlock(&sbi->s_md_lock);
}
+/*
+ * This writepage callback for write_cache_pages()
+ * takes care of a few cases after page cleaning.
+ *
+ * write_cache_pages() already checks for dirty pages
+ * and calls clear_page_dirty_for_io(), which we want,
+ * to write protect the pages.
+ *
+ * However, we may have to redirty a page (see below.)
+ */
+static int ext4_journalled_writepage_callback(struct page *page,
+ struct writeback_control *wbc,
+ void *data)
+{
+ transaction_t *transaction = (transaction_t *) data;
+ struct buffer_head *bh, *head;
+ struct journal_head *jh;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * We have to redirty a page in these cases:
+ * 1) If buffer is dirty, it means the page was dirty because it
+ * contains a buffer that needs checkpointing. So the dirty bit
+ * needs to be preserved so that checkpointing writes the buffer
+ * properly.
+ * 2) If buffer is not part of the committing transaction
+ * (we may have just accidentally come across this buffer because
+ * inode range tracking is not exact) or if the currently running
+ * transaction already contains this buffer as well, dirty bit
+ * needs to be preserved so that the buffer gets writeprotected
+ * properly on running transaction's commit.
+ */
+ jh = bh2jh(bh);
+ if (buffer_dirty(bh) ||
+ (jh && (jh->b_transaction != transaction ||
+ jh->b_next_transaction))) {
+ redirty_page_for_writepage(wbc, page);
+ goto out;
+ }
+ } while ((bh = bh->b_this_page) != head);
+
+out:
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+
+ return write_cache_pages(mapping, &wbc,
+ ext4_journalled_writepage_callback,
+ jinode->i_transaction);
+}
+
+static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret;
+
+ if (ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = ext4_journalled_submit_inode_data_buffers(jinode);
+ else
+ ret = jbd2_journal_submit_inode_data_buffers(jinode);
+
+ return ret;
+}
+
+static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret = 0;
+
+ if (!ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = jbd2_journal_finish_inode_data_buffers(jinode);
+
+ return ret;
+}
+
static bool system_going_down(void)
{
return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
@@ -939,10 +1121,10 @@ static void ext4_blkdev_put(struct block_device *bdev)
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- bdev = sbi->journal_bdev;
+ bdev = sbi->s_journal_bdev;
if (bdev) {
ext4_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
+ sbi->s_journal_bdev = NULL;
}
}
@@ -1073,14 +1255,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev);
+ invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -1149,6 +1331,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ ext4_fc_init_inode(&ei->vfs_inode);
+ mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1166,6 +1350,10 @@ static int ext4_drop_inode(struct inode *inode)
static void ext4_free_in_core_inode(struct inode *inode)
{
fscrypt_free_inode(inode);
+ if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
+ pr_warn("%s: inode %ld still in fc list",
+ __func__, inode->i_ino);
+ }
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
@@ -1191,6 +1379,7 @@ static void init_once(void *foo)
init_rwsem(&ei->i_data_sem);
init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
}
static int __init init_inodecache(void)
@@ -1219,6 +1408,7 @@ static void destroy_inodecache(void)
void ext4_clear_inode(struct inode *inode)
{
+ ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
ext4_discard_preallocations(inode, 0);
@@ -1526,7 +1716,11 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_prefetch_block_bitmaps,
+ Opt_prefetch_block_bitmaps, Opt_no_fc,
+#ifdef CONFIG_EXT4_DEBUG
+ Opt_fc_debug_max_replay,
+#endif
+ Opt_fc_debug_force
};
static const match_table_t tokens = {
@@ -1613,6 +1807,11 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
+ {Opt_no_fc, "no_fc"},
+ {Opt_fc_debug_force, "fc_debug_force"},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
+#endif
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
@@ -1739,6 +1938,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING 0x0400
#define MOPT_SKIP 0x0800
+#define MOPT_2 0x1000
static const struct mount_opts {
int token;
@@ -1839,6 +2039,13 @@ static const struct mount_opts {
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
+ {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+ MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
+ {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+ MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_max_replay, 0, MOPT_GTE0},
+#endif
{Opt_err, 0, 0}
};
@@ -2048,6 +2255,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_li_wait_mult = arg;
} else if (token == Opt_max_dir_size_kb) {
sbi->s_max_dir_size_kb = arg;
+#ifdef CONFIG_EXT4_DEBUG
+ } else if (token == Opt_fc_debug_max_replay) {
+ sbi->s_fc_debug_max_replay = arg;
+#endif
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (token == Opt_resuid) {
@@ -2216,10 +2427,17 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
WARN_ON(1);
return -1;
}
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
+ if (m->flags & MOPT_2) {
+ if (arg != 0)
+ sbi->s_mount_opt2 |= m->mount_opt;
+ else
+ sbi->s_mount_opt2 &= ~m->mount_opt;
+ } else {
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
+ }
}
return 1;
}
@@ -2436,6 +2654,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PUTS("dax=inode");
}
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT))
+ SEQ_OPTS_PUTS("fast_commit");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3754,7 +3975,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -3868,8 +4089,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
logical_sb_block = sb_block;
}
- if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto out_fail;
}
/*
@@ -3936,6 +4160,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ if (ext4_has_feature_fast_commit(sb))
+ set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
if (ext4_has_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
@@ -4265,10 +4491,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread_unmovable(sb, logical_sb_block);
- if (!bh) {
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR,
"Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto failed_mount;
}
es = (struct ext4_super_block *)(bh->b_data + offset);
@@ -4480,18 +4708,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead_unmovable(sb, block);
+ ext4_sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
struct buffer_head *bh;
block = descriptor_loc(sb, logical_sb_block, i);
- bh = sb_bread_unmovable(sb, block);
- if (!bh) {
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
db_count = i;
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto failed_mount2;
}
rcu_read_lock();
@@ -4539,6 +4769,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ atomic_set(&sbi->s_fc_ineligible_updates, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
+ sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
@@ -4588,6 +4838,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
@@ -4646,6 +4897,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
@@ -4748,6 +5003,7 @@ no_journal:
goto failed_mount4a;
}
}
+ ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
err = ext4_mb_init(sb);
@@ -4814,9 +5070,8 @@ no_journal:
* used to detect the metadata async write error.
*/
spin_lock_init(&sbi->s_bdev_wb_lock);
- if (!sb_rdonly(sb))
- errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
- &sbi->s_bdev_wb_err);
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
@@ -4872,6 +5127,7 @@ cantfind_ext4:
failed_mount8:
ext4_unregister_sysfs(sb);
+ kobject_put(&sbi->s_kobj);
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4960,6 +5216,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_commit_interval = sbi->s_commit_interval;
journal->j_min_batch_time = sbi->s_min_batch_time;
journal->j_max_batch_time = sbi->s_max_batch_time;
+ ext4_fc_init(sb, journal);
write_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
@@ -5102,9 +5359,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
+ if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
@@ -5114,7 +5369,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
- EXT4_SB(sb)->journal_bdev = bdev;
+ EXT4_SB(sb)->s_journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
@@ -5708,14 +5963,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
/*
- * Update the original bdev mapping's wb_err value
- * which could be used to detect the metadata async
- * write error.
- */
- errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
- &sbi->s_bdev_wb_err);
-
- /*
* Mounting a RDONLY partition read-write, so reread
* and store the current valid flag. (It may have
* been changed by e2fsck since we originally mounted
@@ -5760,7 +6007,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* Releasing of existing data is done when we are sure remount will
* succeed.
*/
- if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) {
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
err = ext4_setup_system_zone(sb);
if (err)
goto restore_opts;
@@ -5786,7 +6033,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
- if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
/*
@@ -5809,7 +6056,7 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
- if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
@@ -5897,8 +6144,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = u64_to_fsid(fsid);
#ifdef CONFIG_QUOTA
if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
@@ -6042,6 +6288,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
/* Quotafile not on the same filesystem? */
if (path->dentry->d_sb != sb)
return -EXDEV;
+
+ /* Quota already enabled for this file? */
+ if (IS_NOQUOTA(d_inode(path->dentry)))
+ return -EBUSY;
+
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
@@ -6309,6 +6560,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
brelse(bh);
out:
if (inode->i_size < off + len) {
+ ext4_fc_track_range(inode,
+ (inode->i_size > 0 ? inode->i_size - 1 : 0)
+ >> inode->i_sb->s_blocksize_bits,
+ (off + len) >> inode->i_sb->s_blocksize_bits);
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
err2 = ext4_mark_inode_dirty(handle, inode);
@@ -6437,6 +6692,11 @@ static int __init ext4_init_fs(void)
err = init_inodecache();
if (err)
goto out1;
+
+ err = ext4_fc_init_dentry_cache();
+ if (err)
+ goto out05;
+
register_as_ext3();
register_as_ext2();
err = register_filesystem(&ext4_fs_type);
@@ -6447,6 +6707,7 @@ static int __init ext4_init_fs(void)
out:
unregister_as_ext2();
unregister_as_ext3();
+out05:
destroy_inodecache();
out1:
ext4_exit_mballoc();
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index bfabb799fa45..5ff33d18996a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -521,6 +521,8 @@ int ext4_register_sysfs(struct super_block *sb)
proc_create_single_data("es_shrinker_info", S_IRUGO,
sbi->s_proc, ext4_seq_es_shrinker_info_show,
sb);
+ proc_create_single_data("fc_info", 0444, sbi->s_proc,
+ ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index cba4b877c606..6127e94ea4f5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2419,6 +2419,7 @@ retry_inode:
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
cleanup:
brelse(is.iloc.bh);
@@ -2496,6 +2497,7 @@ retry:
if (error == 0)
error = error2;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
return error;
}
@@ -2928,6 +2930,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
error);
goto cleanup;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
}
error = 0;
cleanup:
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 0c958fed3392..00eff2f51807 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1442,8 +1442,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
buf->f_namelen = F2FS_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
#ifdef CONFIG_QUOTA
if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) &&
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a0cf99debb1e..bab9b202b496 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -836,8 +836,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
buf->f_bfree = sbi->free_clusters;
buf->f_bavail = sbi->free_clusters;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen =
(sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
diff --git a/fs/file_table.c b/fs/file_table.c
index 656647f9575a..709ada3151da 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -339,7 +339,7 @@ void fput_many(struct file *file, unsigned int refs)
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
- if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
+ if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 774b2618018a..40ce9a1c12e5 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -8,7 +8,7 @@ config FUSE_FS
There's also a companion library: libfuse2. This library is available
from the FUSE homepage:
- <http://fuse.sourceforge.net/>
+ <https://github.com/libfuse/>
although chances are your distribution already has that library
installed if you've installed the "fuse" package itself.
@@ -38,3 +38,17 @@ config VIRTIO_FS
If you want to share files between guests or with the host, answer Y
or M.
+
+config FUSE_DAX
+ bool "Virtio Filesystem Direct Host Memory Access support"
+ default y
+ select INTERVAL_TREE
+ depends on VIRTIO_FS
+ depends on FS_DAX
+ depends on DAX_DRIVER
+ help
+ This allows bypassing guest page cache and allows mapping host page
+ cache directly in guest address space.
+
+ If you want to allow mounting a Virtio Filesystem with the "dax"
+ option, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3e8cebfb59b7..8c7021fb2cd4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
-virtiofs-y += virtio_fs.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a1303ad303ba..cc7e94d73c6c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
if (!fc)
goto out;
+ down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
- if (fc->sb) {
+
+ /*
+ * Get any fuse_mount belonging to this fuse_conn; s_bdi is
+ * shared between all of them
+ */
+
+ if (!list_empty(&fc->mounts)) {
+ fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
} else {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
}
spin_unlock(&fc->bg_lock);
+ up_read(&fc->killsb);
fuse_conn_put(fc);
out:
return ret;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 2cc17816d7b1..45082269e698 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -57,6 +57,7 @@
struct cuse_conn {
struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_mount fm; /* Dummy mount referencing fc */
struct fuse_conn fc; /* fuse connection */
struct cdev *cdev; /* associated character device */
struct device *dev; /* device representing @cdev */
@@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file)
* Generic permission check is already done against the chrdev
* file, proceed to open.
*/
- rc = fuse_do_open(&cc->fc, 0, file, 0);
+ rc = fuse_do_open(&cc->fm, 0, file, 0);
if (rc)
fuse_conn_put(&cc->fc);
return rc;
@@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file)
static int cuse_release(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_sync_release(NULL, ff, file->f_flags);
- fuse_conn_put(fc);
+ fuse_conn_put(fm->fc);
return 0;
}
@@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = 0;
if (cc->unrestricted_ioctl)
@@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fuse_file *ff = file->private_data;
- struct cuse_conn *cc = fc_to_cc(ff->fc);
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
unsigned int flags = FUSE_IOCTL_COMPAT;
if (cc->unrestricted_ioctl)
@@ -313,9 +314,10 @@ struct cuse_init_args {
* required data structures for it. Please read the comment at the
* top of this file for high level overview.
*/
-static void cuse_process_init_reply(struct fuse_conn *fc,
+static void cuse_process_init_reply(struct fuse_mount *fm,
struct fuse_args *args, int error)
{
+ struct fuse_conn *fc = fm->fc;
struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_args_pages *ap = &ia->ap;
struct cuse_conn *cc = fc_to_cc(fc), *pos;
@@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc)
{
int rc;
struct page *page;
- struct fuse_conn *fc = &cc->fc;
+ struct fuse_mount *fm = &cc->fm;
struct cuse_init_args *ia;
struct fuse_args_pages *ap;
@@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc)
ia->desc.length = ap->args.out_args[1].size;
ap->args.end = cuse_process_init_reply;
- rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (rc) {
kfree(ia);
err_free_page:
@@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
* Limit the cuse channel to requests that can
* be represented in file->f_cred->user_ns.
*/
- fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
+ fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+ &fuse_dev_fiq_ops, NULL);
fud = fuse_dev_alloc_install(&cc->fc);
if (!fud) {
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
new file mode 100644
index 000000000000..ff99ab2a3c43
--- /dev/null
+++ b/fs/fuse/dax.c
@@ -0,0 +1,1365 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK (10)
+
+/*
+ * Dax memory reclaim threshold in percetage of total ranges. When free
+ * number of free ranges drops below this threshold, reclaim can trigger
+ * Default is 20%
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+ /* Pointer to inode where this memory range is mapped */
+ struct inode *inode;
+
+ /* Will connect in fcd->free_ranges to keep track of free memory */
+ struct list_head list;
+
+ /* For interval tree in file/inode */
+ struct interval_tree_node itn;
+
+ /* Will connect in fc->busy_ranges to keep track busy memory */
+ struct list_head busy_list;
+
+ /** Position in DAX window */
+ u64 window_offset;
+
+ /** Length of mapping, in bytes */
+ loff_t length;
+
+ /* Is this mapping read-only or read-write */
+ bool writable;
+
+ /* reference count when the mapping is used by dax iomap. */
+ refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+ /* Semaphore to protect modifications to the dmap tree */
+ struct rw_semaphore sem;
+
+ /* Sorted rb tree of struct fuse_dax_mapping elements */
+ struct rb_root_cached tree;
+ unsigned long nr;
+};
+
+struct fuse_conn_dax {
+ /* DAX device */
+ struct dax_device *dev;
+
+ /* Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /* List of memory ranges which are busy */
+ unsigned long nr_busy_ranges;
+ struct list_head busy_ranges;
+
+ /* Worker to free up memory ranges */
+ struct delayed_work free_work;
+
+ /* Wait queue for a dax range to become free */
+ wait_queue_head_t range_waitq;
+
+ /* DAX Window Free Ranges */
+ long nr_free_ranges;
+ struct list_head free_ranges;
+
+ unsigned long nr_ranges;
+};
+
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+ if (!node)
+ return NULL;
+
+ return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+ unsigned long free_threshold;
+
+ /* If number of free ranges are below threshold, start reclaim */
+ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
+ 1);
+ if (fcd->nr_free_ranges < free_threshold)
+ queue_delayed_work(system_long_wq, &fcd->free_work,
+ msecs_to_jiffies(delay_ms));
+}
+
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+ unsigned long delay_ms)
+{
+ spin_lock(&fcd->lock);
+ __kick_dmap_free_worker(fcd, delay_ms);
+ spin_unlock(&fcd->lock);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+ struct fuse_dax_mapping *dmap;
+
+ spin_lock(&fcd->lock);
+ dmap = list_first_entry_or_null(&fcd->free_ranges,
+ struct fuse_dax_mapping, list);
+ if (dmap) {
+ list_del_init(&dmap->list);
+ WARN_ON(fcd->nr_free_ranges <= 0);
+ fcd->nr_free_ranges--;
+ }
+ spin_unlock(&fcd->lock);
+
+ kick_dmap_free_worker(fcd, 0);
+ return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(&dmap->busy_list);
+ WARN_ON(fcd->nr_busy_ranges == 0);
+ fcd->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(&fcd->lock);
+ __dmap_remove_busy_list(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_add_tail(&dmap->list, &fcd->free_ranges);
+ fcd->nr_free_ranges++;
+ wake_up(&fcd->range_waitq);
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ /* Return fuse_dax_mapping to free list */
+ spin_lock(&fcd->lock);
+ __dmap_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+ struct fuse_dax_mapping *dmap, bool writable,
+ bool upgrade)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn_dax *fcd = fm->fc->dax;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_setupmapping_in inarg;
+ loff_t offset = start_idx << FUSE_DAX_SHIFT;
+ FUSE_ARGS(args);
+ ssize_t err;
+
+ WARN_ON(fcd->nr_free_ranges < 0);
+
+ /* Ask fuse daemon to setup mapping */
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.foffset = offset;
+ inarg.fh = -1;
+ inarg.moffset = dmap->window_offset;
+ inarg.len = FUSE_DAX_SZ;
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+ if (writable)
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+ args.opcode = FUSE_SETUPMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err < 0)
+ return err;
+ dmap->writable = writable;
+ if (!upgrade) {
+ /*
+ * We don't take a refernce on inode. inode is valid right now
+ * and when inode is going away, cleanup logic should first
+ * cleanup dmap entries.
+ */
+ dmap->inode = inode;
+ dmap->itn.start = dmap->itn.last = start_idx;
+ /* Protected by fi->dax->sem */
+ interval_tree_insert(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr++;
+ spin_lock(&fcd->lock);
+ list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+ fcd->nr_busy_ranges++;
+ spin_unlock(&fcd->lock);
+ }
+ return 0;
+}
+
+static int fuse_send_removemapping(struct inode *inode,
+ struct fuse_removemapping_in *inargp,
+ struct fuse_removemapping_one *remove_one)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_REMOVEMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(*inargp);
+ args.in_args[0].value = inargp;
+ args.in_args[1].size = inargp->count * sizeof(*remove_one);
+ args.in_args[1].value = remove_one;
+ return fuse_simple_request(fm, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+ struct list_head *to_remove)
+{
+ struct fuse_removemapping_one *remove_one, *ptr;
+ struct fuse_removemapping_in inarg;
+ struct fuse_dax_mapping *dmap;
+ int ret, i = 0, nr_alloc;
+
+ nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+ remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+ if (!remove_one)
+ return -ENOMEM;
+
+ ptr = remove_one;
+ list_for_each_entry(dmap, to_remove, list) {
+ ptr->moffset = dmap->window_offset;
+ ptr->len = dmap->length;
+ ptr++;
+ i++;
+ num--;
+ if (i >= nr_alloc || num == 0) {
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = i;
+ ret = fuse_send_removemapping(inode, &inarg,
+ remove_one);
+ if (ret)
+ goto out;
+ ptr = remove_one;
+ i = 0;
+ }
+ }
+out:
+ kfree(remove_one);
+ return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+ dmap->itn.start, dmap->itn.last, dmap->window_offset,
+ dmap->length);
+ __dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+ __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point of time it should only be
+ * called from evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ loff_t start, loff_t end)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap, *n;
+ int err, num = 0;
+ LIST_HEAD(to_remove);
+ unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+ unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ while (1) {
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+ end_idx);
+ if (!node)
+ break;
+ dmap = node_to_dmap(node);
+ /* inode is going away. There should not be any users of dmap */
+ WARN_ON(refcount_read(&dmap->refcnt) > 1);
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ num++;
+ list_add(&dmap->list, &to_remove);
+ }
+
+ /* Nothing to remove */
+ if (list_empty(&to_remove))
+ return;
+
+ WARN_ON(fi->dax->nr < num);
+ fi->dax->nr -= num;
+ err = dmap_removemapping_list(inode, num, &to_remove);
+ if (err && err != -ENOTCONN) {
+ pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+ start, end);
+ }
+ spin_lock(&fcd->lock);
+ list_for_each_entry_safe(dmap, n, &to_remove, list) {
+ list_del_init(&dmap->list);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ }
+ spin_unlock(&fcd->lock);
+}
+
+static int dmap_removemapping_one(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ struct fuse_removemapping_one forget_one;
+ struct fuse_removemapping_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = 1;
+ memset(&forget_one, 0, sizeof(forget_one));
+ forget_one.moffset = dmap->window_offset;
+ forget_one.len = dmap->length;
+
+ return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
+/*
+ * It is called from evict_inode() and by that time inode is going away. So
+ * this function does not take any locks like fi->dax->sem for traversing
+ * that fuse inode interval tree. If that lock is taken then lock validator
+ * complains of deadlock situation w.r.t fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * fuse_evict_inode() has already called truncate_inode_pages_final()
+ * before we arrive here. So we should not have to worry about any
+ * pages/exception entries still associated with inode.
+ */
+ inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+ WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap, struct fuse_dax_mapping *dmap,
+ unsigned int flags)
+{
+ loff_t offset, len;
+ loff_t i_size = i_size_read(inode);
+
+ offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+ len = min(length, dmap->length - offset);
+
+ /* If length is beyond end of file, truncate further */
+ if (pos + len > i_size)
+ len = i_size - pos;
+
+ if (len > 0) {
+ iomap->addr = dmap->window_offset + offset;
+ iomap->length = len;
+ if (flags & IOMAP_FAULT)
+ iomap->length = ALIGN(len, PAGE_SIZE);
+ iomap->type = IOMAP_MAPPED;
+ /*
+ * increace refcnt so that reclaim code knows this dmap is in
+ * use. This assumes fi->dax->sem mutex is held either
+ * shared/exclusive.
+ */
+ refcount_inc(&dmap->refcnt);
+
+ /* iomap->private should be NULL */
+ WARN_ON_ONCE(iomap->private);
+ iomap->private = dmap;
+ } else {
+ /* Mapping beyond end of file is hole */
+ fuse_fill_iomap_hole(iomap, length);
+ }
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+ int ret;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Can't do inline reclaim in fault path. We call
+ * dax_layout_busy_page() before we free a range. And
+ * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
+ * In fault path we enter with fi->i_mmap_sem held and can't drop
+ * it. Also in fault path we hold fi->i_mmap_sem shared and not
+ * exclusive, so that creates further issues with fuse_wait_dax_page().
+ * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
+ * range to become free and retry.
+ */
+ if (flags & IOMAP_FAULT) {
+ alloc_dmap = alloc_dax_mapping(fcd);
+ if (!alloc_dmap)
+ return -EAGAIN;
+ } else {
+ alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+ if (IS_ERR(alloc_dmap))
+ return PTR_ERR(alloc_dmap);
+ }
+
+ /* If we are here, we should have memory allocated */
+ if (WARN_ON(!alloc_dmap))
+ return -EIO;
+
+ /*
+ * Take write lock so that only one caller can try to setup mapping
+ * and other waits.
+ */
+ down_write(&fi->dax->sem);
+ /*
+ * We dropped lock. Check again if somebody else setup
+ * mapping already.
+ */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return 0;
+ }
+
+ /* Setup one mapping */
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+ writable, false);
+ if (ret < 0) {
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return ret;
+ }
+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+ up_write(&fi->dax->sem);
+ return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ int ret;
+ unsigned long idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Take exclusive lock so that only one caller can try to setup
+ * mapping and others wait.
+ */
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+ /* We are holding either inode lock or i_mmap_sem, and that should
+ * ensure that dmap can't be truncated. We are holding a reference
+ * on dmap and that should make sure it can't be reclaimed. So dmap
+ * should still be there in tree despite the fact we dropped and
+ * re-acquired the fi->dax->sem lock.
+ */
+ ret = -EIO;
+ if (WARN_ON(!node))
+ goto out_err;
+
+ dmap = node_to_dmap(node);
+
+ /* We took an extra reference on dmap to make sure its not reclaimd.
+ * Now we hold fi->dax->sem lock and that reference is not needed
+ * anymore. Drop it.
+ */
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Maybe another thread already upgraded mapping while we were not
+ * holding lock.
+ */
+ if (dmap->writable) {
+ ret = 0;
+ goto out_fill_iomap;
+ }
+
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+ true);
+ if (ret < 0)
+ goto out_err;
+out_fill_iomap:
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+ up_write(&fi->dax->sem);
+ return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_dax_mapping *dmap;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /* We don't support FIEMAP */
+ if (WARN_ON(flags & IOMAP_REPORT))
+ return -EIO;
+
+ iomap->offset = pos;
+ iomap->flags = 0;
+ iomap->bdev = NULL;
+ iomap->dax_dev = fc->dax->dev;
+
+ /*
+ * Both read/write and mmap path can race here. So we need something
+ * to make sure if we are setting up mapping, then other path waits
+ *
+ * For now, use a semaphore for this. It probably needs to be
+ * optimized later.
+ */
+ down_read(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ if (writable && !dmap->writable) {
+ /* Upgrade read-only mapping to read-write. This will
+ * require exclusive fi->dax->sem lock as we don't want
+ * two threads to be trying to this simultaneously
+ * for same dmap. So drop shared lock and acquire
+ * exclusive lock.
+ *
+ * Before dropping fi->dax->sem lock, take reference
+ * on dmap so that its not freed by range reclaim.
+ */
+ refcount_inc(&dmap->refcnt);
+ up_read(&fi->dax->sem);
+ pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ return fuse_upgrade_dax_mapping(inode, pos, length,
+ flags, iomap);
+ } else {
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ up_read(&fi->dax->sem);
+ return 0;
+ }
+ } else {
+ up_read(&fi->dax->sem);
+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ if (pos >= i_size_read(inode))
+ goto iomap_hole;
+
+ return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+ iomap);
+ }
+
+ /*
+ * If read beyond end of file happnes, fs code seems to return
+ * it as hole
+ */
+iomap_hole:
+ fuse_fill_iomap_hole(iomap, length);
+ pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+ __func__, pos, length, iomap->length);
+ return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_dax_mapping *dmap = iomap->private;
+
+ if (dmap) {
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* DAX writes beyond end-of-file aren't handled using iomap, so the
+ * file size is unchanged and there is nothing to do here.
+ */
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+ .iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ up_write(&fi->i_mmap_sem);
+ schedule();
+ down_write(&fi->i_mmap_sem);
+}
+
+/* Should be called with fi->i_mmap_sem lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+ loff_t start, loff_t end)
+{
+ struct page *page;
+
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+ u64 dmap_end)
+{
+ bool retry;
+ int ret;
+
+ do {
+ retry = false;
+ ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+ dmap_end);
+ } while (ret == 0 && retry);
+
+ return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+ inode_unlock_shared(inode);
+
+ /* TODO file_accessed(iocb->f_filp) */
+ return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return (iov_iter_rw(from) == WRITE &&
+ ((iocb->ki_pos) >= i_size_read(inode) ||
+ (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t ret;
+
+ ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ if (ret < 0)
+ return ret;
+
+ fuse_invalidate_attr(inode);
+ fuse_write_update_size(inode, iocb->ki_pos);
+ return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ /* TODO file_update_time() but we don't want metadata I/O */
+
+ /* Do not use dax for file extending writes as write and on
+ * disk i_size increase are not atomic otherwise.
+ */
+ if (file_extending_write(iocb, from))
+ ret = fuse_dax_direct_write(iocb, from);
+ else
+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
+static int fuse_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
+}
+
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, bool write)
+{
+ vm_fault_t ret;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ pfn_t pfn;
+ int error = 0;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ bool retry = false;
+
+ if (write)
+ sb_start_pagefault(sb);
+retry:
+ if (retry && !(fcd->nr_free_ranges > 0))
+ wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+ error = 0;
+ retry = true;
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ goto retry;
+ }
+
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ up_read(&get_fuse_inode(inode)->i_mmap_sem);
+
+ if (write)
+ sb_end_pagefault(sb);
+
+ return ret;
+}
+
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE,
+ vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+ .fault = fuse_dax_fault,
+ .huge_fault = fuse_dax_huge_fault,
+ .page_mkwrite = fuse_dax_page_mkwrite,
+ .pfn_mkwrite = fuse_dax_pfn_mkwrite,
+};
+
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &fuse_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+}
+
+static int dmap_writeback_invalidate(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
+ loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+ if (ret) {
+ pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+ ret, start_pos, end_pos);
+ return ret;
+ }
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ start_pos >> PAGE_SHIFT,
+ end_pos >> PAGE_SHIFT);
+ if (ret)
+ pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+ ret);
+
+ return ret;
+}
+
+static int reclaim_one_dmap_locked(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * igrab() was done to make sure inode won't go under us, and this
+ * further avoids the race with evict().
+ */
+ ret = dmap_writeback_invalidate(inode, dmap);
+ if (ret)
+ return ret;
+
+ /* Remove dax mapping from inode interval tree now */
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr--;
+
+ /* It is possible that umount/shutdown has killed the fuse connection
+ * and worker thread is trying to reclaim memory in parallel. Don't
+ * warn in that case.
+ */
+ ret = dmap_removemapping_one(inode, dmap);
+ if (ret && ret != -ENOTCONN) {
+ pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+ dmap->window_offset, dmap->length, ret);
+ }
+ return 0;
+}
+
+/* Find first mapped dmap for an inode and return file offset. Caller needs
+ * to hold fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
+ node = interval_tree_iter_next(node, 0, -1)) {
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ continue;
+
+ return dmap;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+ bool *retry)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ u64 dmap_start, dmap_end;
+ unsigned long start_idx;
+ int ret;
+ struct interval_tree_node *node;
+
+ down_write(&fi->i_mmap_sem);
+
+ /* Lookup a dmap and corresponding file offset to reclaim. */
+ down_read(&fi->dax->sem);
+ dmap = inode_lookup_first_dmap(inode);
+ if (dmap) {
+ start_idx = dmap->itn.start;
+ dmap_start = start_idx << FUSE_DAX_SHIFT;
+ dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+ }
+ up_read(&fi->dax->sem);
+
+ if (!dmap)
+ goto out_mmap_sem;
+ /*
+ * Make sure there are no references to inode pages using
+ * get_user_pages()
+ */
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ dmap = ERR_PTR(ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ /* Range already got reclaimed by somebody else */
+ if (!node) {
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1) {
+ dmap = NULL;
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0) {
+ dmap = ERR_PTR(ret);
+ goto out_write_dmap_sem;
+ }
+
+ /* Clean up dmap. Do not add back to free list */
+ dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+
+ pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+ __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return dmap;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+ struct fuse_dax_mapping *dmap;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ while (1) {
+ bool retry = false;
+
+ dmap = alloc_dax_mapping(fcd);
+ if (dmap)
+ return dmap;
+
+ dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+ /*
+ * Either we got a mapping or it is an error, return in both
+ * the cases.
+ */
+ if (dmap)
+ return dmap;
+
+ /* If we could not reclaim a mapping because it
+ * had a reference or some other temporary failure,
+ * Try again. We want to give up inline reclaim only
+ * if there is no range assigned to this node. Otherwise
+ * if a deadlock is possible if we sleep with fi->i_mmap_sem
+ * held and worker to free memory can't make progress due
+ * to unavailability of fi->i_mmap_sem lock. So sleep
+ * only if fi->dax->nr=0
+ */
+ if (retry)
+ continue;
+ /*
+ * There are no mappings which can be reclaimed. Wait for one.
+ * We are not holding fi->dax->sem. So it is possible
+ * that range gets added now. But as we are not holding
+ * fi->i_mmap_sem, worker should still be able to free up
+ * a range and wake us up.
+ */
+ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+ if (wait_event_killable_exclusive(fcd->range_waitq,
+ (fcd->nr_free_ranges > 0))) {
+ return ERR_PTR(-EINTR);
+ }
+ }
+ }
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ /* Find fuse dax mapping at file offset inode. */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+ /* Range already got cleaned up by somebody else */
+ if (!node)
+ return 0;
+ dmap = node_to_dmap(node);
+
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ return 0;
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0)
+ return ret;
+
+ /* Cleanup dmap entry and add back to free list */
+ spin_lock(&fcd->lock);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+ return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take fi->i_mmap_sem to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ * read/write can not reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx,
+ unsigned long end_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
+ loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+ down_write(&fi->i_mmap_sem);
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ up_write(&fi->i_mmap_sem);
+ return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+ unsigned long nr_to_free)
+{
+ struct fuse_dax_mapping *dmap, *pos, *temp;
+ int ret, nr_freed = 0;
+ unsigned long start_idx = 0, end_idx = 0;
+ struct inode *inode = NULL;
+
+ /* Pick first busy range and free it for now*/
+ while (1) {
+ if (nr_freed >= nr_to_free)
+ break;
+
+ dmap = NULL;
+ spin_lock(&fcd->lock);
+
+ if (!fcd->nr_busy_ranges) {
+ spin_unlock(&fcd->lock);
+ return 0;
+ }
+
+ list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+ busy_list) {
+ /* skip this range if it's in use. */
+ if (refcount_read(&pos->refcnt) > 1)
+ continue;
+
+ inode = igrab(pos->inode);
+ /*
+ * This inode is going away. That will free
+ * up all the ranges anyway, continue to
+ * next range.
+ */
+ if (!inode)
+ continue;
+ /*
+ * Take this element off list and add it tail. If
+ * this element can't be freed, it will help with
+ * selecting new element in next iteration of loop.
+ */
+ dmap = pos;
+ list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+ start_idx = end_idx = dmap->itn.start;
+ break;
+ }
+ spin_unlock(&fcd->lock);
+ if (!dmap)
+ return 0;
+
+ ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+ iput(inode);
+ if (ret)
+ return ret;
+ nr_freed++;
+ }
+ return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+ int ret;
+ struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+ free_work.work);
+ ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+ if (ret) {
+ pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+ ret);
+ }
+
+ /* If number of free ranges are still below threhold, requeue */
+ kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+ struct fuse_dax_mapping *range, *temp;
+
+ /* Free All allocated elements */
+ list_for_each_entry_safe(range, temp, mem_list, list) {
+ list_del(&range->list);
+ if (!list_empty(&range->busy_list))
+ list_del(&range->busy_list);
+ kfree(range);
+ }
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+ if (fc->dax) {
+ fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+ kfree(fc->dax);
+ }
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+ long nr_pages, nr_ranges;
+ void *kaddr;
+ pfn_t pfn;
+ struct fuse_dax_mapping *range;
+ int ret, id;
+ size_t dax_size = -1;
+ unsigned long i;
+
+ init_waitqueue_head(&fcd->range_waitq);
+ INIT_LIST_HEAD(&fcd->free_ranges);
+ INIT_LIST_HEAD(&fcd->busy_ranges);
+ INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+ id = dax_read_lock();
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
+ &pfn);
+ dax_read_unlock(id);
+ if (nr_pages < 0) {
+ pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+ return nr_pages;
+ }
+
+ nr_ranges = nr_pages/FUSE_DAX_PAGES;
+ pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+ __func__, nr_pages, nr_ranges);
+
+ for (i = 0; i < nr_ranges; i++) {
+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!range)
+ goto out_err;
+
+ /* TODO: This offset only works if virtio-fs driver is not
+ * having some memory hidden at the beginning. This needs
+ * better handling
+ */
+ range->window_offset = i * FUSE_DAX_SZ;
+ range->length = FUSE_DAX_SZ;
+ INIT_LIST_HEAD(&range->busy_list);
+ refcount_set(&range->refcnt, 1);
+ list_add_tail(&range->list, &fcd->free_ranges);
+ }
+
+ fcd->nr_free_ranges = nr_ranges;
+ fcd->nr_ranges = nr_ranges;
+ return 0;
+out_err:
+ /* Free All allocated elements */
+ fuse_free_dax_mem_ranges(&fcd->free_ranges);
+ return ret;
+}
+
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
+{
+ struct fuse_conn_dax *fcd;
+ int err;
+
+ if (!dax_dev)
+ return 0;
+
+ fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
+ if (!fcd)
+ return -ENOMEM;
+
+ spin_lock_init(&fcd->lock);
+ fcd->dev = dax_dev;
+ err = fuse_dax_mem_range_init(fcd);
+ if (err) {
+ kfree(fcd);
+ return err;
+ }
+
+ fc->dax = fcd;
+ return 0;
+}
+
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fi->dax = NULL;
+ if (fc->dax) {
+ fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+ if (!fi->dax)
+ return false;
+
+ init_rwsem(&fi->dax->sem);
+ fi->dax->tree = RB_ROOT_CACHED;
+ }
+
+ return true;
+}
+
+static const struct address_space_operations fuse_dax_file_aops = {
+ .writepages = fuse_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+};
+
+void fuse_dax_inode_init(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->dax)
+ return;
+
+ inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &fuse_dax_file_aops;
+}
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+ if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+ pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+ map_alignment, FUSE_DAX_SZ);
+ return false;
+ }
+ return true;
+}
+
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+ struct fuse_conn_dax *fcd = fc->dax;
+
+ if (fcd)
+ cancel_delayed_work_sync(&fcd->free_work);
+
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 02b3c36b3676..588f8d1240aa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
return READ_ONCE(file->private_data);
}
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
+ req->fm = fm;
}
-static struct fuse_req *fuse_request_alloc(gfp_t flags)
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
{
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req)
- fuse_request_init(req);
+ fuse_request_init(fm, req);
return req;
}
@@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
}
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
int err;
atomic_inc(&fc->num_waiting);
@@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (fc->conn_error)
goto out;
- req = fuse_request_alloc(GFP_KERNEL);
+ req = fuse_request_alloc(fm, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
@@ -143,7 +145,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
if (unlikely(req->in.h.uid == ((uid_t)-1) ||
req->in.h.gid == ((gid_t)-1))) {
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
return req;
@@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
return ERR_PTR(err);
}
-static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
@@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
* the 'end' callback is called if given, else the reference to the
* request is released
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
if (test_and_set_bit(FR_FINISHED, &req->flags))
@@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
}
if (test_bit(FR_ASYNC, &req->flags))
- req->args->end(fc, req->args, req->out.h.error);
+ req->args->end(fm, req->args, req->out.h.error);
put_request:
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
EXPORT_SYMBOL_GPL(fuse_request_end);
-static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+static int queue_interrupt(struct fuse_req *req)
{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
spin_lock(&fiq->lock);
/* Check for we've sent request to interrupt this req */
if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
@@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
return 0;
}
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+static void request_wait_answer(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
struct fuse_iqueue *fiq = &fc->iq;
int err;
@@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
/* matches barrier in fuse_dev_do_read() */
smp_mb__after_atomic();
if (test_bit(FR_SENT, &req->flags))
- queue_interrupt(fiq, req);
+ queue_interrupt(req);
}
if (!test_bit(FR_FORCE, &req->flags)) {
@@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_req *req)
{
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
spin_lock(&fiq->lock);
@@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
__fuse_get_request(req);
queue_request_and_unlock(fiq, req);
- request_wait_answer(fc, req);
+ request_wait_answer(req);
/* Pairs with smp_wmb() in fuse_request_end() */
smp_rmb();
}
@@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
}
}
-static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_force_creds(struct fuse_req *req)
{
+ struct fuse_conn *fc = req->fm->fc;
+
req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
@@ -473,23 +484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
ssize_t ret;
if (args->force) {
atomic_inc(&fc->num_waiting);
- req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+ req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
if (!args->nocreds)
- fuse_force_creds(fc, req);
+ fuse_force_creds(req);
__set_bit(FR_WAITING, &req->flags);
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
if (!args->noreply)
__set_bit(FR_ISREPLY, &req->flags);
- __fuse_request_send(fc, req);
+ __fuse_request_send(req);
ret = req->out.h.error;
if (!ret && args->out_argvar) {
BUG_ON(args->out_numargs == 0);
ret = args->out_args[args->out_numargs - 1].size;
}
- fuse_put_request(fc, req);
+ fuse_put_request(req);
return ret;
}
-static bool fuse_request_queue_background(struct fuse_conn *fc,
- struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_req *req)
{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
bool queued = false;
WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
@@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fm->sb) {
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
@@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc,
return queued;
}
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags)
{
struct fuse_req *req;
if (args->force) {
WARN_ON(!args->nocreds);
- req = fuse_request_alloc(gfp_flags);
+ req = fuse_request_alloc(fm, gfp_flags);
if (!req)
return -ENOMEM;
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fc, true);
+ req = fuse_get_req(fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
fuse_args_to_req(req, args);
- if (!fuse_request_queue_background(fc, req)) {
- fuse_put_request(fc, req);
+ if (!fuse_request_queue_background(req)) {
+ fuse_put_request(req);
return -ENOTCONN;
}
@@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
}
EXPORT_SYMBOL_GPL(fuse_simple_background);
-static int fuse_simple_notify_reply(struct fuse_conn *fc,
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
struct fuse_args *args, u64 unique)
{
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_iqueue *fiq = &fm->fc->iq;
int err = 0;
- req = fuse_get_req(fc, false);
+ req = fuse_get_req(fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc,
} else {
err = -ENODEV;
spin_unlock(&fiq->lock);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
}
return err;
@@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
+ get_page(oldpage);
err = unlock_request(cs->req);
if (err)
- return err;
+ goto out_put_old;
fuse_copy_finish(cs);
err = pipe_buf_confirm(cs->pipe, buf);
if (err)
- return err;
+ goto out_put_old;
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
@@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
if (err) {
unlock_page(newpage);
- return err;
+ goto out_put_old;
}
get_page(newpage);
@@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (err) {
unlock_page(newpage);
put_page(newpage);
- return err;
+ goto out_put_old;
}
unlock_page(oldpage);
+ /* Drop ref for ap->pages[] array */
put_page(oldpage);
cs->len = 0;
- return 0;
+ err = 0;
+out_put_old:
+ /* Drop ref obtained in this function */
+ put_page(oldpage);
+ return err;
out_fallback_unlock:
unlock_page(newpage);
@@ -868,10 +887,10 @@ out_fallback:
cs->offset = buf->offset;
err = lock_request(cs->req);
- if (err)
- return err;
+ if (!err)
+ err = 1;
- return 1;
+ goto out_put_old;
}
static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
@@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
+ get_page(page);
err = unlock_request(cs->req);
- if (err)
+ if (err) {
+ put_page(page);
return err;
+ }
fuse_copy_finish(cs);
buf = cs->pipebufs;
- get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
@@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* SETXATTR is special, since it may contain too large data */
if (args->opcode == FUSE_SETXATTR)
req->out.h.error = -E2BIG;
- fuse_request_end(fc, req);
+ fuse_request_end(req);
goto restart;
}
spin_lock(&fpq->lock);
@@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
- queue_interrupt(fiq, req);
- fuse_put_request(fc, req);
+ queue_interrupt(req);
+ fuse_put_request(req);
return reqsize;
@@ -1293,7 +1314,7 @@ out_end:
if (!test_bit(FR_PRIVATE, &req->flags))
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
return err;
err_unlock:
@@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
fuse_copy_finish(cs);
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb) {
- err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
- outarg.off, outarg.len);
- }
+ err = fuse_reverse_inval_inode(fc, outarg.ino,
+ outarg.off, outarg.len);
up_read(&fc->killsb);
return err;
@@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = -ENOENT;
- if (fc->sb)
- err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
- outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (!fc->sb)
- goto out_up_killsb;
-
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
goto out_up_killsb;
@@ -1621,7 +1631,7 @@ struct fuse_retrieve_args {
struct fuse_notify_retrieve_in inarg;
};
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_retrieve_args *ra =
@@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
kfree(ra);
}
-static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
struct fuse_notify_retrieve_out *outarg)
{
int err;
@@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int offset;
size_t total_len = 0;
unsigned int num_pages;
+ struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
struct fuse_args_pages *ap;
@@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
args->in_args[0].value = &ra->inarg;
args->in_args[1].size = total_len;
- err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+ err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
- fuse_retrieve_end(fc, args, err);
+ fuse_retrieve_end(fm, args, err);
return err;
}
@@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_retrieve_out outarg;
+ struct fuse_mount *fm;
struct inode *inode;
+ u64 nodeid;
int err;
err = -EINVAL;
@@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
down_read(&fc->killsb);
err = -ENOENT;
- if (fc->sb) {
- u64 nodeid = outarg.nodeid;
+ nodeid = outarg.nodeid;
- inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
- if (inode) {
- err = fuse_retrieve(fc, inode, &outarg);
- iput(inode);
- }
+ inode = fuse_ilookup(fc, nodeid, &fm);
+ if (inode) {
+ err = fuse_retrieve(fm, inode, &outarg);
+ iput(inode);
}
up_read(&fc->killsb);
@@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
else if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- err = queue_interrupt(&fc->iq, req);
+ err = queue_interrupt(req);
- fuse_put_request(fc, req);
+ fuse_put_request(req);
goto copy_finish;
}
@@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
out:
return err ? err : nbytes;
@@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct fuse_conn *fc, struct list_head *head)
+static void end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
req->out.h.error = -ECONNABORTED;
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
@@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
} else {
spin_unlock(&fc->lock);
}
@@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(fc, &to_end);
+ end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 26f028bc760b..ff7dbeb16f88 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
{
struct inode *inode;
struct dentry *parent;
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
@@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (flags & LOOKUP_RCU)
goto out;
- fc = get_fuse_conn(inode);
+ fm = get_fuse_mount(inode);
forget = fuse_alloc_forget();
ret = -ENOMEM;
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+ fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
&entry->d_name, &outarg);
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
if (!ret) {
fi = get_fuse_inode(inode);
- if (outarg.nodeid != get_node_id(inode)) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ if (outarg.nodeid != get_node_id(inode) ||
+ (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+ fuse_queue_forget(fm->fc, forget,
+ outarg.nodeid, 1);
goto invalid;
}
spin_lock(&fi->lock);
@@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry)
return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
}
+/*
+ * Create a fuse_mount object with a new superblock (with path->dentry
+ * as the root), and return that mount so it can be auto-mounted on
+ * @path.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+ struct fs_context *fsc;
+ struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb);
+ struct fuse_conn *fc = parent_fm->fc;
+ struct fuse_mount *fm;
+ struct vfsmount *mnt;
+ struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+ struct super_block *sb;
+ int err;
+
+ fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fsc)) {
+ err = PTR_ERR(fsc);
+ goto out;
+ }
+
+ err = -ENOMEM;
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ goto out_put_fsc;
+
+ refcount_set(&fm->count, 1);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ fuse_mount_put(fm);
+ goto out_put_fsc;
+ }
+ fm->fc = fuse_conn_get(fc);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err)
+ goto out_put_sb;
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+ /* We are done configuring the superblock, so unlock it */
+ up_write(&sb->s_umount);
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ /* Create the submount */
+ mnt = vfs_create_mount(fsc);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
+ goto out_put_fsc;
+ }
+ mntget(mnt);
+ put_fs_context(fsc);
+ return mnt;
+
+out_put_sb:
+ /*
+ * Only jump here when fsc->root is NULL and sb is still locked
+ * (otherwise put_fs_context() will put the superblock)
+ */
+ deactivate_locked_super(sb);
+out_put_fsc:
+ put_fs_context(fsc);
+out:
+ return ERR_PTR(err);
+}
+
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_delete = fuse_dentry_delete,
@@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = {
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
#endif
+ .d_automount = fuse_dentry_automount,
};
const struct dentry_operations fuse_root_dentry_operations = {
@@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
u64 attr_version;
@@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
if (!forget)
goto out;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
- fuse_lookup_init(fc, &args, nodeid, name, outarg);
- err = fuse_simple_request(fc, &args);
+ fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fm, &args);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
if (err || !outarg->nodeid)
goto out_put_forget;
@@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
attr_version);
err = -ENOMEM;
if (!*inode) {
- fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
goto out;
}
err = 0;
@@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
{
int err;
struct inode *inode;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
struct fuse_forget_link *forget;
struct fuse_create_in inarg;
@@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_err;
err = -ENOMEM;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
goto out_put_forget_req;
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
flags &= ~O_NOCTTY;
@@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[0].value = &outentry;
args.out_args[1].size = sizeof(outopen);
args.out_args[1].value = &outopen;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err)
goto out_free_ff;
@@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
- fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
err = -ENOMEM;
goto out_err;
}
@@ -567,7 +644,7 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
struct inode *dir, struct dentry *entry,
umode_t mode)
{
@@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
args->out_numargs = 1;
args->out_args[0].size = sizeof(outarg);
args->out_args[0].value = &outarg;
- err = fuse_simple_request(fc, args);
+ err = fuse_simple_request(fm, args);
if (err)
goto out_put_forget_req;
@@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr, entry_attr_timeout(&outarg), 0);
if (!inode) {
- fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
return -ENOMEM;
}
kfree(forget);
@@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
dev_t rdev)
{
struct fuse_mknod_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, mode);
+ return create_new_entry(fm, &args, dir, entry, mode);
}
static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
@@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
- if (!fc->dont_mask)
+ if (!fm->fc->dont_mask)
mode &= ~current_umask();
memset(&inarg, 0, sizeof(inarg));
@@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+ return create_new_entry(fm, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct inode *dir, struct dentry *entry,
const char *link)
{
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
unsigned len = strlen(link) + 1;
FUSE_ARGS(args);
@@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
args.in_args[0].value = entry->d_name.name;
args.in_args[1].size = len;
args.in_args[1].value = link;
- return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+ return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
void fuse_update_ctime(struct inode *inode)
@@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode)
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_UNLINK;
@@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
/*
* If i_nlink == 0 then unlink doesn't make sense, yet this can
* happen if userspace filesystem is careless. It would be
@@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
args.opcode = FUSE_RMDIR;
@@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in_numargs = 1;
args.in_args[0].size = entry->d_name.len + 1;
args.in_args[0].value = entry->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
@@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
{
int err;
struct fuse_rename2_in inarg;
- struct fuse_conn *fc = get_fuse_conn(olddir);
+ struct fuse_mount *fm = get_fuse_mount(olddir);
FUSE_ARGS(args);
memset(&inarg, 0, argsize);
@@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
fuse_invalidate_attr(d_inode(oldent));
@@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
int err;
struct fuse_link_in inarg;
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
@@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+ err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
/* Contrary to "normal" filesystems it can happen that link
makes two "logical" inodes point to the same "physical"
inode. We invalidate the attributes of the old one, so it
@@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
if (likely(inode->i_nlink < UINT_MAX))
inc_nlink(inode);
spin_unlock(&fi->lock);
@@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
int err;
struct fuse_getattr_in inarg;
struct fuse_attr_out outarg;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
u64 attr_version;
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
(inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file)
STATX_BASIC_STATS & ~STATX_ATIME, 0);
}
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name)
{
int err = -ENOTDIR;
@@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
struct dentry *dir;
struct dentry *entry;
- parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+ parent = fuse_ilookup(fc, parent_nodeid, NULL);
if (!parent)
return -ENOENT;
@@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc)
static int fuse_access(struct inode *inode, int mask)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_access_in inarg;
int err;
BUG_ON(mask & MAY_NOT_BLOCK);
- if (fc->no_access)
+ if (fm->fc->no_access)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask)
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_access = 1;
+ fm->fc->no_access = 1;
err = 0;
}
return err;
@@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask)
static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
struct fuse_args_pages ap = {
.num_pages = 1,
@@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page)
ap.args.page_zeroing = true;
ap.args.out_numargs = 1;
ap.args.out_args[0].size = desc.length;
- res = fuse_simple_request(fc, &ap.args);
+ res = fuse_simple_request(fm, &ap.args);
fuse_invalidate_atime(inode);
@@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
*/
int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid = FATTR_MTIME;
inarg.mtime = inode->i_mtime.tv_sec;
inarg.mtimensec = inode->i_mtime.tv_nsec;
- if (fc->minor >= 23) {
+ if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
inarg.ctime = inode->i_ctime.tv_sec;
inarg.ctimensec = inode->i_ctime.tv_nsec;
@@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
- fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
/*
@@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct file *file)
{
struct inode *inode = d_inode(dentry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
@@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
loff_t oldsize;
int err;
bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool fault_blocked = false;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
@@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (err)
return err;
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
+ is_truncate = true;
+ }
+
+ if (FUSE_IS_DAX(inode) && is_truncate) {
+ down_write(&fi->i_mmap_sem);
+ fault_blocked = true;
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err) {
+ up_write(&fi->i_mmap_sem);
+ return err;
+ }
+ }
+
if (attr->ia_valid & ATTR_OPEN) {
/* This is coming from open(..., ... | O_TRUNC); */
WARN_ON(!(attr->ia_valid & ATTR_SIZE));
@@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
*/
i_size_write(inode, 0);
truncate_pagecache(inode, 0);
- return 0;
+ goto out;
}
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE) {
- if (WARN_ON(!S_ISREG(inode->i_mode)))
- return -EIO;
- is_truncate = true;
- }
-
/* Flush dirty data/metadata before non-truncate SETATTR */
if (is_wb && S_ISREG(inode->i_mode) &&
attr->ia_valid &
@@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -EINTR)
fuse_invalidate_attr(inode);
@@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
+
return 0;
error:
@@ -1621,6 +1714,9 @@ error:
fuse_release_nowrite(inode);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (fault_blocked)
+ up_write(&fi->i_mmap_sem);
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 43c165e796da..c03034e8c152 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
return pages;
}
-static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
@@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
memset(&inarg, 0, sizeof(inarg));
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
- if (!fc->atomic_o_trunc)
+ if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
args.opcode = opcode;
args.nodeid = nodeid;
@@ -51,7 +51,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
struct fuse_release_args {
@@ -60,7 +60,7 @@ struct fuse_release_args {
struct inode *inode;
};
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
struct fuse_file *ff;
@@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
if (unlikely(!ff))
return NULL;
- ff->fc = fc;
+ ff->fm = fm;
ff->release_args = kzalloc(sizeof(*ff->release_args),
GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
@@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
- ff->kh = atomic64_inc_return(&fc->khctr);
+ ff->kh = atomic64_inc_return(&fm->fc->khctr);
return ff;
}
@@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
return ff;
}
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
@@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
if (refcount_dec_and_test(&ff->count)) {
struct fuse_args *args = &ff->release_args->args;
- if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
+ if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
/* Do nothing when client does not implement 'open' */
- fuse_release_end(ff->fc, args, 0);
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
- fuse_simple_request(ff->fc, args);
- fuse_release_end(ff->fc, args, 0);
+ fuse_simple_request(ff->fm, args);
+ fuse_release_end(ff->fm, args, 0);
} else {
args->end = fuse_release_end;
- if (fuse_simple_background(ff->fc, args,
+ if (fuse_simple_background(ff->fm, args,
GFP_KERNEL | __GFP_NOFAIL))
- fuse_release_end(ff->fc, args, -ENOTCONN);
+ fuse_release_end(ff->fm, args, -ENOTCONN);
}
kfree(ff);
}
}
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
- ff = fuse_file_alloc(fc);
+ ff = fuse_file_alloc(fm);
if (!ff)
return -ENOMEM;
@@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
struct fuse_open_out outarg;
int err;
- err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
@@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file)
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
int err;
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
+ bool dax_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc && FUSE_IS_DAX(inode);
err = generic_file_open(inode, file);
if (err)
return err;
- if (is_wb_truncate) {
+ if (is_wb_truncate || dax_truncate) {
inode_lock(inode);
fuse_set_nowrite(inode);
}
- err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+ if (dax_truncate) {
+ down_write(&get_fuse_inode(inode)->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+ err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (is_wb_truncate) {
+out:
+ if (dax_truncate)
+ up_write(&get_fuse_inode(inode)->i_mmap_sem);
+
+ if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
inode_unlock(inode);
}
@@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
int flags, int opcode)
{
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
/* Inode is NULL on error path of fuse_create_open() */
@@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir)
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
- ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
(fl_owner_t) file);
}
/* Hold inode until release is finished */
@@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir)
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy, isdir);
+ fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
@@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode)
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
struct fuse_flush_in inarg;
FUSE_ARGS(args);
@@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id)
return err;
err = 0;
- if (fc->no_flush)
+ if (fm->fc->no_flush)
goto inval_attr_out;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
- inarg.lock_owner = fuse_lock_owner_id(fc, id);
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
args.opcode = FUSE_FLUSH;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
@@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
args.in_args[0].value = &inarg;
args.force = true;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_flush = 1;
+ fm->fc->no_flush = 1;
err = 0;
}
@@ -489,7 +503,7 @@ inval_attr_out:
* In memory i_blocks is not maintained by fuse, if writeback cache is
* enabled, i_blocks from cached attr may not be accurate.
*/
- if (!err && fc->writeback_cache)
+ if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr(inode);
return err;
}
@@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
@@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- return fuse_simple_request(fc, &args);
+ return fuse_simple_request(fm, &args);
}
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
@@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia)
kfree(ia);
}
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
@@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
fuse_io_free(ia);
}
-static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
struct fuse_io_args *ia, size_t num_bytes)
{
ssize_t err;
@@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
- err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
if (err)
- fuse_aio_complete_req(fc, &ia->ap.args, err);
+ fuse_aio_complete_req(fm, &ia->ap.args, err);
return num_bytes;
}
@@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
{
struct file *file = ia->io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
if (owner != NULL) {
ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
- ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
+ ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- return fuse_simple_request(fc, &ia->ap.args);
+ return fuse_simple_request(fm, &ia->ap.args);
}
static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
static int fuse_do_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
loff_t pos = page_offset(page);
struct fuse_page_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
@@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page)
*/
fuse_wait_on_page_writeback(inode, page->index);
- attr_ver = fuse_get_attr_version(fc);
+ attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
- res = fuse_simple_request(fc, &ia.ap.args);
+ res = fuse_simple_request(fm, &ia.ap.args);
if (res < 0)
return res;
/*
@@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page)
return err;
}
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
int i;
@@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
@@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
WARN_ON((loff_t) (pos + count) < 0);
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
- ia->read.attr_ver = fuse_get_attr_version(fc);
- if (fc->async_read) {
+ ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+ if (fm->fc->async_read) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
- err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+ err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (!err)
return;
} else {
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
err = res < 0 ? res : 0;
}
- fuse_readpages_end(fc, &ap->args, err);
+ fuse_readpages_end(fm, &ap->args, err);
}
static void fuse_readahead(struct readahead_control *rac)
@@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
args->opcode = FUSE_WRITE;
args->nodeid = ff->nodeid;
args->in_numargs = 2;
- if (ff->fc->minor < 9)
+ if (ff->fm->fc->minor < 9)
args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
args->in_args[0].size = sizeof(ia->write.in);
@@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
struct kiocb *iocb = ia->io->iocb;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_write_in *inarg = &ia->write.in;
ssize_t err;
@@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
inarg->flags = fuse_write_flags(iocb);
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
- inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
- return fuse_async_req_send(fc, ia, count);
+ return fuse_async_req_send(fm, ia, count);
- err = fuse_simple_request(fc, &ia->ap.args);
+ err = fuse_simple_request(fm, &ia->ap.args);
if (!err && ia->write.out.size > count)
err = -EIO;
@@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct fuse_args_pages *ap = &ia->ap;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
int err;
@@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
- err = fuse_simple_request(fc, &ap->args);
+ err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
err = -EIO;
@@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
struct file *file = io->iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
@@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_read_iter(iocb, to);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
@@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
- if (is_bad_inode(file_inode(file)))
+ if (is_bad_inode(inode))
return -EIO;
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_write_iter(iocb, from);
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
@@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_conn *fc,
+static void fuse_writepage_finish(struct fuse_mount *fm,
struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
@@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
}
/* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc,
+static void fuse_send_writepage(struct fuse_mount *fm,
struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
@@ -1622,10 +1644,10 @@ __acquires(fi->lock)
args->force = true;
args->nocreds = true;
- err = fuse_simple_background(fc, args, GFP_ATOMIC);
+ err = fuse_simple_background(fm, args, GFP_ATOMIC);
if (err == -ENOMEM) {
spin_unlock(&fi->lock);
- err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+ err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&fi->lock);
}
@@ -1638,7 +1660,7 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
/* After fuse_writepage_finish() aux request list is private */
@@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t crop = i_size_read(inode);
struct fuse_writepage_args *wpa;
@@ -1671,7 +1693,7 @@ __acquires(fi->lock)
wpa = list_entry(fi->queued_writes.next,
struct fuse_writepage_args, queue_entry);
list_del_init(&wpa->queue_entry);
- fuse_send_writepage(fc, wpa, crop);
+ fuse_send_writepage(fm, wpa, crop);
}
}
@@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
WARN_ON(fuse_insert_writeback(root, wpa));
}
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_writepage_args *wpa =
@@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_writepage_args *next = wpa->next;
@@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
* no invocations of fuse_writepage_end() while we're in
* fuse_set_nowrite..fuse_release_nowrite section.
*/
- fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fc, wpa);
+ fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
@@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
+ /* DAX mmap is superior to direct_io mmap */
+ if (FUSE_IS_DAX(file_inode(file)))
+ return fuse_dax_mmap(file, vma);
+
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED */
if (vma->vm_flags & VM_MAYSHARE)
@@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
@@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
- err = convert_fuse_file_lock(fc, &outarg.lk, fl);
+ err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
return err;
}
@@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
- pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
+ pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
/* locking is restartable */
if (err == -EINTR)
@@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
- if (!inode->i_sb->s_bdev || fc->no_bmap)
+ if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
return 0;
memset(&inarg, 0, sizeof(inarg));
@@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS)
- fc->no_bmap = 1;
+ fm->fc->no_bmap = 1;
return err ? 0 : outarg.block;
}
@@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_lseek_in inarg = {
@@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
struct fuse_lseek_out outarg;
int err;
- if (fc->no_lseek)
+ if (fm->fc->no_lseek)
goto fallback;
args.opcode = FUSE_LSEEK;
@@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err) {
if (err == -ENOSYS) {
- fc->no_lseek = 1;
+ fm->fc->no_lseek = 1;
goto fallback;
}
return err;
@@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_ioctl_in inarg = {
.fh = ff->fh,
.cmd = cmd,
@@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!ap.pages || !iov_page)
goto out;
- fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
/*
* If restricted, initialize IO parameters as encoded in @cmd.
@@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
/* make sure there are enough buffer pages and init request with them */
err = -ENOMEM;
- if (max_pages > fc->max_pages)
+ if (max_pages > fm->fc->max_pages)
goto out;
while (ap.num_pages < max_pages) {
ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_simple_request(fc, &ap.args);
+ transferred = fuse_simple_request(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
@@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
goto out;
vaddr = kmap_atomic(ap.pages[0]);
- err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
kunmap_atomic(vaddr);
@@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iov = iov_page;
out_iov = in_iov + in_iovs;
- err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
if (err)
goto out;
- err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
if (err)
goto out;
@@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
struct fuse_file *ff = file->private_data;
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
FUSE_ARGS(args);
int err;
- if (fc->no_poll)
+ if (fm->fc->no_poll)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
@@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
*/
if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
- fuse_register_polled_file(fc, ff);
+ fuse_register_polled_file(fm->fc, ff);
}
args.opcode = FUSE_POLL;
@@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
return demangle_poll(outarg.revents);
if (err == -ENOSYS) {
- fc->no_poll = 1;
+ fm->fc->no_poll = 1;
return DEFAULT_POLLMASK;
}
return EPOLLERR;
@@ -3120,13 +3146,13 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = ff->fc->async_dio;
+ io->async = ff->fm->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
/* optimization for short read */
if (io->async && !io->write && offset + count > i_size) {
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
+ iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
shortened = count - iov_iter_count(iter);
count -= shortened;
}
@@ -3196,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_conn *fc = ff->fc;
+ struct fuse_mount *fm = ff->fm;
FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
@@ -3208,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_PUNCH_HOLE);
+ bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (fc->no_fallocate)
+ if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
if (lock_inode) {
inode_lock(inode);
+ if (block_faults) {
+ down_write(&fi->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
loff_t endbyte = offset + length - 1;
@@ -3240,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_fallocate = 1;
+ fm->fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
if (err)
@@ -3252,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
bool changed = fuse_write_update_size(inode, offset + length);
- if (changed && fc->writeback_cache)
+ if (changed && fm->fc->writeback_cache)
file_update_time(file);
}
@@ -3265,6 +3300,9 @@ out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (block_faults)
+ up_write(&fi->i_mmap_sem);
+
if (lock_inode)
inode_unlock(inode);
@@ -3280,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct fuse_inode *fi_out = get_fuse_inode(inode_out);
- struct fuse_conn *fc = ff_in->fc;
+ struct fuse_mount *fm = ff_in->fm;
+ struct fuse_conn *fc = fm->fc;
FUSE_ARGS(args);
struct fuse_copy_file_range_in inarg = {
.fh_in = ff_in->fh,
@@ -3349,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
@@ -3404,6 +3443,7 @@ static const struct file_operations fuse_file_operations = {
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
+ .get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -3439,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode)
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
fi->writepages = RB_ROOT;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_inode_init(inode);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 740a8a7d7ae6..d51598017d13 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -148,6 +148,20 @@ struct fuse_inode {
/** Lock to protect write related fields */
spinlock_t lock;
+
+ /**
+ * Can't take inode lock in fault path (leads to circular dependency).
+ * Introduce another semaphore which can be taken in fault path and
+ * then other filesystem paths can take this to block faults.
+ */
+ struct rw_semaphore i_mmap_sem;
+
+#ifdef CONFIG_FUSE_DAX
+ /*
+ * Dax specific inode data
+ */
+ struct fuse_inode_dax *dax;
+#endif
};
/** FUSE inode state bits */
@@ -161,12 +175,13 @@ enum {
};
struct fuse_conn;
+struct fuse_mount;
struct fuse_release_args;
/** FUSE specific file data */
struct fuse_file {
/** Fuse connection for this file */
- struct fuse_conn *fc;
+ struct fuse_mount *fm;
/* Argument space reserved for release */
struct fuse_release_args *release_args;
@@ -252,7 +267,7 @@ struct fuse_args {
bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
- void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+ void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
};
struct fuse_args_pages {
@@ -360,6 +375,9 @@ struct fuse_req {
/** virtio-fs's physically contiguous buffer for in and out args */
void *argbuf;
#endif
+
+ /** fuse_mount this request belongs to */
+ struct fuse_mount *fm;
};
struct fuse_iqueue;
@@ -482,11 +500,15 @@ struct fuse_fs_context {
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
- bool no_mount_options:1;
+ bool legacy_opts_show:1;
+ bool dax:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
+ /* DAX device, may be NULL */
+ struct dax_device *dax_dev;
+
/* fuse_dev pointer to fill in, should contain NULL on entry */
void **fudptr;
};
@@ -494,9 +516,9 @@ struct fuse_fs_context {
/**
* A Fuse connection.
*
- * This structure is created, when the filesystem is mounted, and is
- * destroyed, when the client device is closed and the filesystem is
- * unmounted.
+ * This structure is created, when the root filesystem is mounted, and
+ * is destroyed, when the client device is closed and the last
+ * fuse_mount is destroyed.
*/
struct fuse_conn {
/** Lock protecting accessess to members of this structure */
@@ -610,6 +632,9 @@ struct fuse_conn {
/** cache READLINK responses in page cache */
unsigned cache_symlinks:1;
+ /* show legacy mount options */
+ unsigned int legacy_opts_show:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -717,8 +742,8 @@ struct fuse_conn {
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
- /* Do not show mount options */
- unsigned int no_mount_options:1;
+ /* Auto-mount submounts announced by the server */
+ unsigned int auto_submounts:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -726,10 +751,10 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_conn_list */
+ /** Entry on the fuse_mount_list */
struct list_head entry;
- /** Device ID from super block */
+ /** Device ID from the root super block */
dev_t dev;
/** Dentries in the control filesystem */
@@ -747,24 +772,70 @@ struct fuse_conn {
/** Called on final put */
void (*release)(struct fuse_conn *);
- /** Super block for this connection. */
- struct super_block *sb;
-
- /** Read/write semaphore to hold when accessing sb. */
+ /**
+ * Read/write semaphore to hold when accessing the sb of any
+ * fuse_mount belonging to this connection
+ */
struct rw_semaphore killsb;
/** List of device instances belonging to this connection */
struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+ /* Dax specific conn data, non-NULL if DAX is enabled */
+ struct fuse_conn_dax *dax;
+#endif
+
+ /** List of filesystems using this connection */
+ struct list_head mounts;
};
-static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object allows sharing a fuse_conn between separate mounts to
+ * allow submounts with dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+ /* Underlying (potentially shared) connection to the FUSE server */
+ struct fuse_conn *fc;
+
+ /* Refcount */
+ refcount_t count;
+
+ /*
+ * Super block for this connection (fc->killsb must be held when
+ * accessing this).
+ */
+ struct super_block *sb;
+
+ /* Entry on fc->mounts */
+ struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
}
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+
+ return fm ? fm->fc : NULL;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb);
+}
+
static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
{
- return get_fuse_conn_super(inode->i_sb);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+
+ return fm ? fm->fc : NULL;
}
static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
@@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations;
extern const struct dentry_operations fuse_root_dentry_operations;
/**
- * Inode to nodeid comparison.
- */
-int fuse_inode_eq(struct inode *inode, void *_nodeidp);
-
-/**
* Get a filled in inode
*/
struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
*/
int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
@@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
-int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
/**
* End a finished request
*/
-void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
@@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
/**
@@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
*/
void fuse_conn_put(struct fuse_conn *fc);
+/**
+ * Acquire reference to fuse_mount
+ */
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm);
+
+/**
+ * Release reference to fuse_mount
+ */
+void fuse_mount_put(struct fuse_mount *fm);
+
struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
-void fuse_send_init(struct fuse_conn *fc);
+void fuse_send_init(struct fuse_mount *fm);
/**
* Fill in superblock and initialize fuse connection
@@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc);
*/
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
-/**
- * Disassociate fuse connection from superblock and kill the superblock
+/*
+ * Fill in superblock for submounts
+ * @sb: partially-initialized superblock to fill in
+ * @parent_fi: The fuse_inode of the parent filesystem where this submount is
+ * mounted
+ */
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi);
+
+/*
+ * Remove the mount from the connection
*
- * Calls kill_anon_super(), do not use with bdev mounts.
+ * Returns whether this was the last mount
*/
-void fuse_kill_sb_anon(struct super_block *sb);
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
+ */
+void fuse_conn_destroy(struct fuse_mount *fm);
/**
* Add connection to control filesystem
@@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
/**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result. Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm);
+
+/**
* File-system tells the kernel to invalidate cache for the given node id.
*/
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len);
/**
@@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
* - is a file or oan empty directory
* then the dentry is unhashed (d_delete()).
*/
-int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name);
-int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
/**
@@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
u64 fuse_get_unique(struct fuse_iqueue *fiq);
void fuse_free_conn(struct fuse_conn *fc);
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_cleanup(struct inode *inode);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 581329203d68..1a47afc95f80 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
+ init_rwsem(&fi->i_mmap_sem);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
- if (!fi->forget) {
- kmem_cache_free(fuse_inode_cachep, fi);
- return NULL;
- }
+ if (!fi->forget)
+ goto out_free;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+ goto out_free_forget;
return &fi->inode;
+
+out_free_forget:
+ kfree(fi->forget);
+out_free:
+ kmem_cache_free(fuse_inode_cachep, fi);
+ return NULL;
}
static void fuse_free_inode(struct inode *inode)
@@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode)
mutex_destroy(&fi->mutex);
kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+ kfree(fi->dax);
+#endif
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode)
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
- fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
- fi->forget = NULL;
+
+ if (FUSE_IS_DAX(inode))
+ fuse_dax_inode_cleanup(inode);
+ if (fi->nlookup) {
+ fuse_queue_forget(fc, fi->forget, fi->nodeid,
+ fi->nlookup);
+ fi->forget = NULL;
+ }
}
if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
WARN_ON(!list_empty(&fi->write_files));
@@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
BUG();
}
-int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
{
u64 nodeid = *(u64 *) _nodeidp;
if (get_node_id(inode) == nodeid)
@@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
struct fuse_inode *fi;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- retry:
+ /*
+ * Auto mount points get their node id from the submount root, which is
+ * not a unique identifier within this filesystem.
+ *
+ * To avoid conflicts, do not place submount points into the inode hash
+ * table.
+ */
+ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+ S_ISDIR(attr->mode)) {
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ fuse_init_inode(inode, attr);
+ get_fuse_inode(inode)->nodeid = nodeid;
+ inode->i_flags |= S_AUTOMOUNT;
+ goto done;
+ }
+
+retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
if (!inode)
return NULL;
@@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
iput(inode);
goto retry;
}
-
+done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
@@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
return inode;
}
-int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm)
+{
+ struct fuse_mount *fm_iter;
+ struct inode *inode;
+
+ WARN_ON(!rwsem_is_locked(&fc->killsb));
+ list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+ if (!fm_iter->sb)
+ continue;
+
+ inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ if (fm)
+ *fm = fm_iter;
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
loff_t offset, loff_t len)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
- inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+ inode = fuse_ilookup(fc, nodeid, NULL);
if (!inode)
return -ENOENT;
@@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb)
fuse_abort_conn(fc);
}
-static void fuse_send_destroy(struct fuse_conn *fc)
+static void fuse_send_destroy(struct fuse_mount *fm)
{
- if (fc->conn_init) {
+ if (fm->fc->conn_init) {
FUSE_ARGS(args);
args.opcode = FUSE_DESTROY;
args.force = true;
args.nocreds = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
}
}
static void fuse_put_super(struct super_block *sb)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
- mutex_lock(&fuse_mutex);
- list_del(&fc->entry);
- fuse_ctl_remove_conn(fc);
- mutex_unlock(&fuse_mutex);
-
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
}
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
FUSE_ARGS(args);
struct fuse_statfs_out outarg;
int err;
- if (!fuse_allow_current_process(fc)) {
+ if (!fuse_allow_current_process(fm->fc)) {
buf->f_type = FUSE_SUPER_MAGIC;
return 0;
}
@@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (!err)
convert_fuse_statfs(buf, &outarg.st);
return err;
@@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (fc->no_mount_options)
- return 0;
+ if (fc->legacy_opts_show) {
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
+ if (fc->default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (fc->allow_other)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ }
+#ifdef CONFIG_FUSE_DAX
+ if (fc->dax)
+ seq_puts(m, ",dax");
+#endif
- seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
- if (fc->default_permissions)
- seq_puts(m, ",default_permissions");
- if (fc->allow_other)
- seq_puts(m, ",allow_other");
- if (fc->max_read != ~0)
- seq_printf(m, ",max_read=%u", fc->max_read);
- if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
- seq_printf(m, ",blksize=%lu", sb->s_blocksize);
return 0;
}
@@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
{
memset(fc, 0, sizeof(*fc));
@@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+
+ INIT_LIST_HEAD(&fc->mounts);
+ list_add(&fm->fc_entry, &fc->mounts);
+ fm->fc = fc;
+ refcount_set(&fm->count, 1);
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
@@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
+void fuse_mount_put(struct fuse_mount *fm)
+{
+ if (refcount_dec_and_test(&fm->count)) {
+ if (fm->fc)
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_mount_put);
+
+struct fuse_mount *fuse_mount_get(struct fuse_mount *fm)
+{
+ refcount_inc(&fm->count);
+ return fm;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_get);
+
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
@@ -895,14 +978,16 @@ struct fuse_init_args {
struct fuse_init_out out;
};
-static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
+ struct fuse_conn *fc = fm->fc;
struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
struct fuse_init_out *arg = &ia->out;
+ bool ok = true;
if (error || arg->major != FUSE_KERNEL_VERSION)
- fc->conn_error = 1;
+ ok = false;
else {
unsigned long ra_pages;
@@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
if (arg->flags & FUSE_HANDLE_KILLPRIV)
fc->handle_killpriv = 1;
if (arg->time_gran && arg->time_gran <= 1000000000)
- fc->sb->s_time_gran = arg->time_gran;
+ fm->sb->s_time_gran = arg->time_gran;
if ((arg->flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
- fc->sb->s_xattr = fuse_acl_xattr_handlers;
+ fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
if (arg->flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
@@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
min_t(unsigned int, FUSE_MAX_MAX_PAGES,
max_t(unsigned int, arg->max_pages, 1));
}
+ if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+ arg->flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
- fc->sb->s_bdi->ra_pages =
- min(fc->sb->s_bdi->ra_pages, ra_pages);
+ fm->sb->s_bdi->ra_pages =
+ min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
}
kfree(ia);
+ if (!ok) {
+ fc->conn_init = 0;
+ fc->conn_error = 1;
+ }
+
fuse_set_initialized(fc);
wake_up_all(&fc->blocked_waitq);
}
-void fuse_send_init(struct fuse_conn *fc)
+void fuse_send_init(struct fuse_mount *fm)
{
struct fuse_init_args *ia;
@@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc)
ia->in.major = FUSE_KERNEL_VERSION;
ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
- ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+ ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
ia->in.flags |=
FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
@@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc)
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+ if (fm->fc->dax)
+ ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
+ if (fm->fc->auto_submounts)
+ ia->in.flags |= FUSE_SUBMOUNTS;
+
ia->args.opcode = FUSE_INIT;
ia->args.in_numargs = 1;
ia->args.in_args[0].size = sizeof(ia->in);
@@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc)
ia->args.nocreds = true;
ia->args.end = process_init_reply;
- if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
- process_init_reply(fc, &ia->args, -ENOTCONN);
+ if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+ process_init_reply(fm, &ia->args, -ENOTCONN);
}
EXPORT_SYMBOL_GPL(fuse_send_init);
@@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud)
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+ const struct fuse_inode *fi)
+{
+ *attr = (struct fuse_attr){
+ .ino = fi->inode.i_ino,
+ .size = fi->inode.i_size,
+ .blocks = fi->inode.i_blocks,
+ .atime = fi->inode.i_atime.tv_sec,
+ .mtime = fi->inode.i_mtime.tv_sec,
+ .ctime = fi->inode.i_ctime.tv_sec,
+ .atimensec = fi->inode.i_atime.tv_nsec,
+ .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .mode = fi->inode.i_mode,
+ .nlink = fi->inode.i_nlink,
+ .uid = fi->inode.i_uid.val,
+ .gid = fi->inode.i_gid.val,
+ .rdev = fi->inode.i_rdev,
+ .blksize = 1u << fi->inode.i_blkbits,
+ };
+}
+
+static void fuse_sb_defaults(struct super_block *sb)
+{
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+
+ /*
+ * If we are not in the initial user namespace posix
+ * acls must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct super_block *parent_sb = parent_fi->inode.i_sb;
+ struct fuse_attr root_attr;
+ struct inode *root;
+
+ fuse_sb_defaults(sb);
+ fm->sb = sb;
+
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+ sb->s_xattr = parent_sb->s_xattr;
+ sb->s_time_gran = parent_sb->s_time_gran;
+ sb->s_blocksize = parent_sb->s_blocksize;
+ sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+ sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+ if (parent_sb->s_subtype && !sb->s_subtype)
+ return -ENOMEM;
+
+ fuse_fill_attr_from_inode(&root_attr, parent_fi);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ /*
+ * This inode is just a duplicate, so it is not looked up and
+ * its nlookup should not be incremented. fuse_iget() does
+ * that, though, so undo it here.
+ */
+ get_fuse_inode(root)->nlookup--;
+ sb->s_d_op = &fuse_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
struct fuse_dev *fud = NULL;
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct inode *root;
struct dentry *root_dentry;
int err;
@@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;
- sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+ fuse_sb_defaults(sb);
if (ctx->is_bdev) {
#ifdef CONFIG_BLOCK
@@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
sb->s_subtype = ctx->subtype;
ctx->subtype = NULL;
- sb->s_magic = FUSE_SUPER_MAGIC;
- sb->s_op = &fuse_super_operations;
- sb->s_xattr = fuse_xattr_handlers;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_export_op = &fuse_export_operations;
- sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
- if (sb->s_user_ns != &init_user_ns)
- sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
-
- /*
- * If we are not in the initial user namespace posix
- * acls must be translated.
- */
- if (sb->s_user_ns != &init_user_ns)
- sb->s_xattr = fuse_no_acl_xattr_handlers;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ err = fuse_dax_conn_alloc(fc, ctx->dax_dev);
+ if (err)
+ goto err;
+ }
if (ctx->fudptr) {
err = -ENOMEM;
fud = fuse_dev_alloc_install(fc);
if (!fud)
- goto err;
+ goto err_free_dax;
}
fc->dev = sb->s_dev;
- fc->sb = sb;
+ fm->sb = sb;
err = fuse_bdi_init(fc, sb);
if (err)
goto err_dev_free;
@@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->allow_other = ctx->allow_other;
fc->user_id = ctx->user_id;
fc->group_id = ctx->group_id;
- fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+ fc->legacy_opts_show = ctx->legacy_opts_show;
+ fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
- fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
@@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err_dev_free:
if (fud)
fuse_dev_free(fud);
+ err_free_dax:
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
err:
return err;
}
@@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
struct file *file;
int err;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
err = -EINVAL;
file = fget(ctx->fd);
@@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
if (!fc)
goto err_fput;
- fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ goto err_fput;
+ }
+
+ fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
fc->release = fuse_free_conn;
- sb->s_fs_info = fc;
+
+ sb->s_fs_info = fm;
err = fuse_fill_super_common(sb, ctx);
if (err)
@@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
* CPUs after this
*/
fput(file);
- fuse_send_init(get_fuse_conn_super(sb));
+ fuse_send_init(get_fuse_mount_super(sb));
return 0;
err_put_conn:
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
sb->s_fs_info = NULL;
err_fput:
fput(file);
@@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc)
ctx->max_read = ~0;
ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+ ctx->legacy_opts_show = true;
#ifdef CONFIG_BLOCK
if (fc->fs_type == &fuseblk_fs_type) {
@@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc)
return 0;
}
-static void fuse_sb_destroy(struct super_block *sb)
+bool fuse_mount_remove(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ bool last = false;
- if (fc) {
- if (fc->destroy)
- fuse_send_destroy(fc);
+ down_write(&fc->killsb);
+ list_del_init(&fm->fc_entry);
+ if (list_empty(&fc->mounts))
+ last = true;
+ up_write(&fc->killsb);
- fuse_abort_conn(fc);
- fuse_wait_aborted(fc);
+ return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
- down_write(&fc->killsb);
- fc->sb = NULL;
- up_write(&fc->killsb);
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+
+ if (fc->destroy)
+ fuse_send_destroy(fm);
+
+ fuse_abort_conn(fc);
+ fuse_wait_aborted(fc);
+
+ if (!list_empty(&fc->entry)) {
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
-void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_kill_sb_anon(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_anon_super(sb);
}
-EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
@@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse");
#ifdef CONFIG_BLOCK
static void fuse_kill_sb_blk(struct super_block *sb)
{
- fuse_sb_destroy(sb);
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
kill_block_super(sb);
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 90e3f01bd796..3b5e91045871 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -252,7 +252,7 @@ retry:
static void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_forget_in inarg;
FUSE_ARGS(args);
@@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid)
args.force = true;
args.noreply = true;
- fuse_simple_request(fc, &args);
+ fuse_simple_request(fm, &args);
/* ignore errors */
}
@@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ssize_t res;
struct page *page;
struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
struct fuse_page_desc desc = { .length = PAGE_SIZE };
@@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
ap->pages = &page;
ap->descs = &desc;
if (plus) {
- attr_version = fuse_get_attr_version(fc);
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
@@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fm, &ap->args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 104f35de5270..21a9e534417c 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,12 +5,17 @@
*/
#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/highmem.h>
+#include <linux/uio.h>
#include "fuse_i.h"
/* List of virtio-fs device instances and a lock for the list. Also provides
@@ -24,6 +29,8 @@ enum {
VQ_REQUEST
};
+#define VQ_NAME_LEN 24
+
/* Per-virtqueue state */
struct virtio_fs_vq {
spinlock_t lock;
@@ -36,7 +43,7 @@ struct virtio_fs_vq {
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
- char name[24];
+ char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
@@ -47,6 +54,12 @@ struct virtio_fs {
struct virtio_fs_vq *vqs;
unsigned int nvqs; /* number of virtqueues */
unsigned int num_request_queues; /* number of request queues */
+ struct dax_device *dax_dev;
+
+ /* DAX memory window where file contents are mapped */
+ void *window_kaddr;
+ phys_addr_t window_phys_addr;
+ size_t window_len;
};
struct virtio_fs_forget_req {
@@ -69,6 +82,44 @@ struct virtio_fs_req_work {
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
+enum {
+ OPT_DAX,
+};
+
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+ fsparam_flag("dax", OPT_DAX),
+ {}
+};
+
+static int virtio_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, virtio_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_DAX:
+ ctx->dax = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void virtio_fs_free_fc(struct fs_context *fc)
+{
+ struct fuse_fs_context *ctx = fc->fs_private;
+
+ kfree(ctx);
+}
+
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
@@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
dispatch_work.work);
- struct fuse_conn *fc = fsvq->fud->fc;
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
list_del_init(&req->list);
spin_unlock(&fsvq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
/* Dispatch pending requests */
@@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
spin_unlock(&fsvq->lock);
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
ret);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
}
}
}
@@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
struct fuse_pqueue *fpq = &fsvq->fud->pq;
- struct fuse_conn *fc = fsvq->fud->fc;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
@@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req,
clear_bit(FR_SENT, &req->flags);
spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+ fuse_request_end(req);
spin_lock(&fsvq->lock);
dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
@@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
schedule_work(&fsvq->done_work);
}
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+ int vq_type)
+{
+ strncpy(fsvq->name, name, VQ_NAME_LEN);
+ spin_lock_init(&fsvq->lock);
+ INIT_LIST_HEAD(&fsvq->queued_reqs);
+ INIT_LIST_HEAD(&fsvq->end_reqs);
+ init_completion(&fsvq->in_flight_zero);
+
+ if (vq_type == VQ_REQUEST) {
+ INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
+ } else {
+ INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ }
+}
+
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
if (fs->num_request_queues == 0)
return -EINVAL;
- fs->nvqs = 1 + fs->num_request_queues;
+ fs->nvqs = VQ_REQUEST + fs->num_request_queues;
fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
if (!fs->vqs)
return -ENOMEM;
@@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
goto out;
}
+ /* Initialize the hiprio/forget request virtqueue */
callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
- snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
- "hiprio");
+ virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
- INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
- INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
- virtio_fs_hiprio_dispatch_work);
- init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero);
- spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
- spin_lock_init(&fs->vqs[i].lock);
- INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_request_dispatch_work);
- INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
- INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
- init_completion(&fs->vqs[i].in_flight_zero);
- snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
- "requests.%u", i - VQ_REQUEST);
+ char vq_name[VQ_NAME_LEN];
+
+ snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+ virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
names[i] = fs->vqs[i].name;
}
@@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
vdev->config->del_vqs(vdev);
}
+/* Map a window offset to a page frame number. The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct virtio_fs *fs = dax_get_private(dax_dev);
+ phys_addr_t offset = PFN_PHYS(pgoff);
+ size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+
+ if (kaddr)
+ *kaddr = fs->window_kaddr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+ PFN_DEV | PFN_MAP);
+ return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_from_iter(addr, bytes, i);
+}
+
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+ .direct_access = virtio_fs_direct_access,
+ .copy_from_iter = virtio_fs_copy_from_iter,
+ .copy_to_iter = virtio_fs_copy_to_iter,
+ .zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+ struct dax_device *dax_dev = data;
+
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ struct virtio_shm_region cache_reg;
+ struct dev_pagemap *pgmap;
+ bool have_cache;
+
+ if (!IS_ENABLED(CONFIG_FUSE_DAX))
+ return 0;
+
+ /* Get cache region */
+ have_cache = virtio_get_shm_region(vdev, &cache_reg,
+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+ if (!have_cache) {
+ dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+ return 0;
+ }
+
+ if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+ dev_name(&vdev->dev))) {
+ dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+ cache_reg.addr, cache_reg.len);
+ return -EBUSY;
+ }
+
+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+ cache_reg.addr);
+
+ pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+ /* Ideally we would directly use the PCI BAR resource but
+ * devm_memremap_pages() wants its own copy in pgmap. So
+ * initialize a struct resource from scratch (only the start
+ * and end fields will be used).
+ */
+ pgmap->range = (struct range) {
+ .start = (phys_addr_t) cache_reg.addr,
+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+ };
+ pgmap->nr_range = 1;
+
+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+ if (IS_ERR(fs->window_kaddr))
+ return PTR_ERR(fs->window_kaddr);
+
+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+ fs->window_len = (phys_addr_t) cache_reg.len;
+
+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+ __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+ fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+ if (IS_ERR(fs->dax_dev))
+ return PTR_ERR(fs->dax_dev);
+
+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+ fs->dax_dev);
+}
+
static int virtio_fs_probe(struct virtio_device *vdev)
{
struct virtio_fs *fs;
@@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
/* TODO vq affinity */
+ ret = virtio_fs_setup_dax(vdev, fs);
+ if (ret < 0)
+ goto out_vqs;
+
/* Bring the device online in case the filesystem is mounted and
* requests need to be sent before we return.
*/
@@ -833,18 +1018,37 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
+/* Count number of scatter-gather elements required */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ this_len = min(page_descs[i].length, total_len);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
/* Return the number of scatter-gather list elements required */
static unsigned int sg_count_fuse_req(struct fuse_req *req)
{
struct fuse_args *args = req->args;
struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
- unsigned int total_sgs = 1 /* fuse_in_header */;
+ unsigned int size, total_sgs = 1 /* fuse_in_header */;
if (args->in_numargs - args->in_pages)
total_sgs += 1;
- if (args->in_pages)
- total_sgs += ap->num_pages;
+ if (args->in_pages) {
+ size = args->in_args[args->in_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
if (!test_bit(FR_ISREPLY, &req->flags))
return total_sgs;
@@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req)
if (args->out_numargs - args->out_pages)
total_sgs += 1;
- if (args->out_pages)
- total_sgs += ap->num_pages;
+ if (args->out_pages) {
+ size = args->out_args[args->out_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
return total_sgs;
}
@@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
.release = virtio_fs_fiq_release,
};
-static int virtio_fs_fill_super(struct super_block *sb)
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
+ ctx->rootmode = S_IFDIR;
+ ctx->default_permissions = 1;
+ ctx->allow_other = 1;
+ ctx->max_read = UINT_MAX;
+ ctx->blksize = 512;
+ ctx->destroy = true;
+ ctx->no_control = true;
+ ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
struct virtio_fs *fs = fc->iq.priv;
+ struct fuse_fs_context *ctx = fsc->fs_private;
unsigned int i;
int err;
- struct fuse_fs_context ctx = {
- .rootmode = S_IFDIR,
- .default_permissions = 1,
- .allow_other = 1,
- .max_read = UINT_MAX,
- .blksize = 512,
- .destroy = true,
- .no_control = true,
- .no_force_umount = true,
- .no_mount_options = true,
- };
+ virtio_fs_ctx_set_defaults(ctx);
mutex_lock(&virtio_fs_mutex);
/* After holding mutex, make sure virtiofs device is still there.
@@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb)
}
/* virtiofs allocates and installs its own fuse devices */
- ctx.fudptr = NULL;
- err = fuse_fill_super_common(sb, &ctx);
+ ctx->fudptr = NULL;
+ if (ctx->dax)
+ ctx->dax_dev = fs->dax_dev;
+ err = fuse_fill_super_common(sb, ctx);
if (err < 0)
goto err_free_fuse_devs;
@@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
/* Previous unmount will stop all queues. Start these again */
virtio_fs_start_all_queues(fs);
- fuse_send_init(fc);
+ fuse_send_init(fm);
mutex_unlock(&virtio_fs_mutex);
return 0;
@@ -1136,18 +1349,17 @@ err:
return err;
}
-static void virtio_kill_sb(struct super_block *sb)
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
{
- struct fuse_conn *fc = get_fuse_conn_super(sb);
- struct virtio_fs *vfs;
- struct virtio_fs_vq *fsvq;
-
- /* If mount failed, we can still be called without any fc */
- if (!fc)
- return fuse_kill_sb_anon(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *vfs = fc->iq.priv;
+ struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
- vfs = fc->iq.priv;
- fsvq = &vfs->vqs[VQ_HIPRIO];
+ /* Stop dax worker. Soon evict_inodes() will be called which
+ * will free all memory ranges belonging to all inodes.
+ */
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_cancel_work(fc);
/* Stop forget queue. Soon destroy will be sent */
spin_lock(&fsvq->lock);
@@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb)
spin_unlock(&fsvq->lock);
virtio_fs_drain_all_queues(vfs);
- fuse_kill_sb_anon(sb);
+ fuse_conn_destroy(fm);
- /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+ /* fuse_conn_destroy() must have sent destroy. Stop all queues
* and drain one more time and free fuse devices. Freeing fuse
* devices will drop their reference on fuse_conn and that in
* turn will drop its reference on virtio_fs object.
@@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb)
virtio_fs_free_devs(vfs);
}
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ /* If mount failed, we can still be called without any fc */
+ if (fm) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ virtio_fs_conn_destroy(fm);
+ }
+ kill_anon_super(sb);
+}
+
static int virtio_fs_test_super(struct super_block *sb,
struct fs_context *fsc)
{
- struct fuse_conn *fc = fsc->s_fs_info;
+ struct fuse_mount *fsc_fm = fsc->s_fs_info;
+ struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
- return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+ return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
}
static int virtio_fs_set_super(struct super_block *sb,
@@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb,
err = get_anon_bdev(&sb->s_dev);
if (!err)
- fuse_conn_get(fsc->s_fs_info);
+ fuse_mount_get(fsc->s_fs_info);
return err;
}
@@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
struct virtio_fs *fs;
struct super_block *sb;
struct fuse_conn *fc;
+ struct fuse_mount *fm;
int err;
/* This gets a reference on virtio_fs object. This ptr gets installed
@@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
return -ENOMEM;
}
- fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
- fs);
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm) {
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
+ &virtio_fs_fiq_ops, fs);
fc->release = fuse_free_conn;
fc->delete_stale = true;
+ fc->auto_submounts = true;
- fsc->s_fs_info = fc;
+ fsc->s_fs_info = fm;
sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
- fuse_conn_put(fc);
+ fuse_mount_put(fm);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
- err = virtio_fs_fill_super(sb);
+ err = virtio_fs_fill_super(sb, fsc);
if (err) {
deactivate_locked_super(sb);
return err;
@@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
}
static const struct fs_context_operations virtio_fs_context_ops = {
+ .free = virtio_fs_free_fc,
+ .parse_param = virtio_fs_parse_param,
.get_tree = virtio_fs_get_tree,
};
static int virtio_fs_init_fs_context(struct fs_context *fsc)
{
+ struct fuse_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fsc->fs_private = ctx;
fsc->ops = &virtio_fs_context_ops;
return 0;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 20d052e08b3b..371bdcbc7233 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -14,12 +14,12 @@
int fuse_setxattr(struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
int err;
- if (fc->no_setxattr)
+ if (fm->fc->no_setxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
args.in_args[1].value = name;
args.in_args[2].size = size;
args.in_args[2].value = value;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_setxattr = 1;
+ fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
@@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (fc->no_getxattr)
+ if (fm->fc->no_getxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
- fc->no_getxattr = 1;
+ fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size)
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
struct inode *inode = d_inode(entry);
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
struct fuse_getxattr_out outarg;
ssize_t ret;
- if (!fuse_allow_current_process(fc))
+ if (!fuse_allow_current_process(fm->fc))
return -EACCES;
- if (fc->no_listxattr)
+ if (fm->fc->no_listxattr)
return -EOPNOTSUPP;
memset(&inarg, 0, sizeof(inarg));
@@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
}
- ret = fuse_simple_request(fc, &args);
+ ret = fuse_simple_request(fm, &args);
if (!ret && !size)
ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
- fc->no_listxattr = 1;
+ fm->fc->no_listxattr = 1;
ret = -EOPNOTSUPP;
}
return ret;
@@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
int fuse_removexattr(struct inode *inode, const char *name)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
int err;
- if (fc->no_removexattr)
+ if (fm->fc->no_removexattr)
return -EOPNOTSUPP;
args.opcode = FUSE_REMOVEXATTR;
@@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.in_numargs = 1;
args.in_args[0].size = strlen(name) + 1;
args.in_args[0].value = name;
- err = fuse_simple_request(fc, &args);
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
- fc->no_removexattr = 1;
+ fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
if (!err) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index d4af283fc888..9cd2ecad07db 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -91,22 +91,13 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned offset;
+ struct iomap_writepage_ctx wpc = { };
if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
goto out;
if (current->journal_info)
goto redirty;
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE-1);
- if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- goto out;
- }
-
- return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
+ return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops);
redirty:
redirty_page_for_writepage(wbc, page);
@@ -115,11 +106,16 @@ out:
return 0;
}
-/* This is the same as calling block_write_full_page, but it also
+/**
+ * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is the same as calling block_write_full_page, but it also
* writes pages outside of i_size
*/
-static int gfs2_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
+static int gfs2_write_jdata_page(struct page *page,
+ struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
@@ -137,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block,
if (page->index == end_index && offset)
zero_user_segment(page, offset, PAGE_SIZE);
- return __block_write_full_page(inode, page, get_block, wbc,
+ return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc,
end_buffer_async_write);
}
@@ -166,7 +162,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
}
gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize);
}
- return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
+ return gfs2_write_jdata_page(page, wbc);
}
/**
@@ -208,7 +204,8 @@ static int gfs2_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
- int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+ struct iomap_writepage_ctx wpc = { };
+ int ret;
/*
* Even if we didn't write any pages here, we might still be holding
@@ -216,9 +213,9 @@ static int gfs2_writepages(struct address_space *mapping,
* want balance_dirty_pages() to loop indefinitely trying to write out
* pages held in the ail that it can't find.
*/
+ ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
if (ret == 0)
set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
-
return ret;
}
@@ -470,12 +467,13 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
static int __gfs2_readpage(void *file, struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
- if (i_blocksize(page->mapping->host) == PAGE_SIZE &&
- !page_has_buffers(page)) {
+ if (!gfs2_is_jdata(ip) ||
+ (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) {
error = iomap_readpage(page, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
error = stuffed_readpage(ip, page);
@@ -563,8 +561,12 @@ static void gfs2_readahead(struct readahead_control *rac)
struct inode *inode = rac->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- if (!gfs2_is_stuffed(ip))
+ if (gfs2_is_stuffed(ip))
+ ;
+ else if (gfs2_is_jdata(ip))
mpage_readahead(rac, gfs2_block_map);
+ else
+ iomap_readahead(rac, &gfs2_iomap_ops);
}
/**
@@ -621,7 +623,8 @@ out:
static int jdata_set_page_dirty(struct page *page)
{
- SetPageChecked(page);
+ if (current->journal_info)
+ SetPageChecked(page);
return __set_page_dirty_buffers(page);
}
@@ -663,8 +666,11 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
if (bd) {
if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
list_del_init(&bd->bd_list);
- else
+ else {
+ spin_lock(&sdp->sd_ail_lock);
gfs2_remove_from_journal(bh, REMOVE_JDATA);
+ spin_unlock(&sdp->sd_ail_lock);
+ }
}
bh->b_bdev = NULL;
clear_buffer_mapped(bh);
@@ -736,7 +742,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
*/
gfs2_log_lock(sdp);
- spin_lock(&sdp->sd_ail_lock);
head = bh = page_buffers(page);
do {
if (atomic_read(&bh->b_count))
@@ -748,7 +753,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
goto cannot_release;
bh = bh->b_this_page;
} while(bh != head);
- spin_unlock(&sdp->sd_ail_lock);
head = bh = page_buffers(page);
do {
@@ -774,7 +778,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
return try_to_free_buffers(page);
cannot_release:
- spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
return 0;
}
@@ -784,12 +787,13 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
+ .set_page_dirty = iomap_set_page_dirty,
+ .releasepage = iomap_releasepage,
+ .invalidatepage = iomap_invalidatepage,
.bmap = gfs2_bmap,
- .invalidatepage = gfs2_invalidatepage,
- .releasepage = gfs2_releasepage,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
+ .migratepage = iomap_migrate_page,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0f69fbd4af66..8dff9cbd0a87 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -56,7 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
u64 block, struct page *page)
{
struct inode *inode = &ip->i_inode;
- struct buffer_head *bh;
int release = 0;
if (!page || page->index) {
@@ -80,20 +79,21 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
SetPageUptodate(page);
}
- if (!page_has_buffers(page))
- create_empty_buffers(page, BIT(inode->i_blkbits),
- BIT(BH_Uptodate));
+ if (gfs2_is_jdata(ip)) {
+ struct buffer_head *bh;
- bh = page_buffers(page);
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, BIT(inode->i_blkbits),
+ BIT(BH_Uptodate));
- if (!buffer_mapped(bh))
- map_bh(bh, inode->i_sb, block);
+ bh = page_buffers(page);
+ if (!buffer_mapped(bh))
+ map_bh(bh, inode->i_sb, block);
- set_buffer_uptodate(bh);
- if (gfs2_is_jdata(ip))
+ set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
- else {
- mark_buffer_dirty(bh);
+ } else {
+ set_page_dirty(page);
gfs2_ordered_add_inode(ip);
}
@@ -1158,7 +1158,8 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
struct metapath mp = { .mp_aheight = 1, };
int ret;
- iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ if (gfs2_is_jdata(ip))
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
trace_gfs2_iomap_start(ip, pos, length, flags);
if (gfs2_iomap_need_write_lock(flags)) {
@@ -1291,6 +1292,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
loff_t length = bh_map->b_size;
struct metapath mp = { .mp_aheight = 1, };
struct iomap iomap = { };
+ int flags = create ? IOMAP_WRITE : 0;
int ret;
clear_buffer_mapped(bh_map);
@@ -1298,15 +1300,14 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
clear_buffer_boundary(bh_map);
trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
- if (create) {
- ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
- if (!ret && iomap.type == IOMAP_HOLE)
+ ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp);
+ if (!ret && iomap.type == IOMAP_HOLE) {
+ if (create)
ret = gfs2_iomap_alloc(inode, &iomap, &mp);
- release_metapath(&mp);
- } else {
- ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
- release_metapath(&mp);
+ else
+ ret = -ENODATA;
}
+ release_metapath(&mp);
if (ret)
goto out;
@@ -2518,3 +2519,26 @@ out:
gfs2_trans_end(sdp);
return error;
}
+
+static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
+ loff_t offset)
+{
+ struct metapath mp = { .mp_aheight = 1, };
+ int ret;
+
+ if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
+ return -EIO;
+
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+
+ memset(&wpc->iomap, 0, sizeof(wpc->iomap));
+ ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp);
+ release_metapath(&mp);
+ return ret;
+}
+
+const struct iomap_writeback_ops gfs2_writeback_ops = {
+ .map_blocks = gfs2_map_blocks,
+};
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index b88fd45ab79f..aed4632d47d3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
extern const struct iomap_ops gfs2_iomap_ops;
+extern const struct iomap_writeback_ops gfs2_writeback_ops;
extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
extern int gfs2_block_map(struct inode *inode, sector_t lblock,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f13b136654ca..5441c17562c5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -270,7 +270,12 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
- GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp));
+ if (mapping) {
+ truncate_inode_pages_final(mapping);
+ if (!gfs2_withdrawn(sdp))
+ GLOCK_BUG_ON(gl, mapping->nrpages ||
+ mapping->nrexceptional);
+ }
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
@@ -453,9 +458,6 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
else
gl->gl_lockref.count--;
}
- if (held1 && held2 && list_empty(&gl->gl_holders))
- clear_bit(GLF_QUEUED, &gl->gl_flags);
-
if (new_state != gl->gl_target)
/* shorten our minimum hold time */
gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR,
@@ -1049,7 +1051,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_object = NULL;
gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
- INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func);
+ if (gl->gl_name.ln_type == LM_TYPE_IOPEN)
+ INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func);
mapping = gfs2_glock2aspace(gl);
if (mapping) {
@@ -1345,7 +1348,6 @@ fail:
if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
insert_pt = &gh2->gh_list;
}
- set_bit(GLF_QUEUED, &gl->gl_flags);
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
@@ -1646,16 +1648,15 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
unsigned long now = jiffies;
gfs2_glock_hold(gl);
+ spin_lock(&gl->gl_lockref.lock);
holdtime = gl->gl_tchange + gl->gl_hold_time;
- if (test_bit(GLF_QUEUED, &gl->gl_flags) &&
+ if (!list_empty(&gl->gl_holders) &&
gl->gl_name.ln_type == LM_TYPE_INODE) {
if (time_before(now, holdtime))
delay = holdtime - now;
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
-
- spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -1842,10 +1843,9 @@ static struct shrinker glock_shrinker = {
};
/**
- * examine_bucket - Call a function for glock in a hash bucket
+ * glock_hash_walk - Call a function for glock in a hash bucket
* @examiner: the function
* @sdp: the filesystem
- * @bucket: the bucket
*
* Note that the function can be called multiple times on the same
* object. So the user must ensure that the function can cope with
@@ -1901,9 +1901,11 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl)
static void flush_delete_work(struct gfs2_glock *gl)
{
- if (cancel_delayed_work(&gl->gl_delete)) {
- queue_delayed_work(gfs2_delete_workqueue,
- &gl->gl_delete, 0);
+ if (gl->gl_name.ln_type == LM_TYPE_IOPEN) {
+ if (cancel_delayed_work(&gl->gl_delete)) {
+ queue_delayed_work(gfs2_delete_workqueue,
+ &gl->gl_delete, 0);
+ }
}
gfs2_glock_queue_work(gl, 0);
}
@@ -2100,7 +2102,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'I';
if (test_bit(GLF_FROZEN, gflags))
*p++ = 'F';
- if (test_bit(GLF_QUEUED, gflags))
+ if (!list_empty(&gl->gl_holders))
*p++ = 'q';
if (test_bit(GLF_LRU, gflags))
*p++ = 'L';
@@ -2415,7 +2417,7 @@ static const struct seq_operations gfs2_glstats_seq_ops = {
.show = gfs2_glstats_seq_show,
};
-static const struct seq_operations gfs2_sbstats_seq_ops = {
+static const struct seq_operations gfs2_sbstats_sops = {
.start = gfs2_sbstats_seq_start,
.next = gfs2_sbstats_seq_next,
.stop = gfs2_sbstats_seq_stop,
@@ -2468,16 +2470,6 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
}
-static int gfs2_sbstats_open(struct inode *inode, struct file *file)
-{
- int ret = seq_open(file, &gfs2_sbstats_seq_ops);
- if (ret == 0) {
- struct seq_file *seq = file->private_data;
- seq->private = inode->i_private; /* sdp */
- }
- return ret;
-}
-
static const struct file_operations gfs2_glocks_fops = {
.owner = THIS_MODULE,
.open = gfs2_glocks_open,
@@ -2494,13 +2486,7 @@ static const struct file_operations gfs2_glstats_fops = {
.release = gfs2_glocks_release,
};
-static const struct file_operations gfs2_sbstats_fops = {
- .owner = THIS_MODULE,
- .open = gfs2_sbstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats);
void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index de1d5f1d9ff8..aa3f5236befb 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -178,6 +178,9 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
+ const unsigned bsize = sdp->sd_sb.sb_bsize;
+ loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
int error;
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -186,18 +189,13 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_RGRP_GO_SYNC);
- filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
- error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
- WARN_ON_ONCE(error);
+ filemap_fdatawrite_range(mapping, start, end);
+ error = filemap_fdatawait_range(mapping, start, end);
+ WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
mapping_set_error(mapping, error);
if (!error)
error = gfs2_ail_empty_gl(gl);
-
- spin_lock(&gl->gl_lockref.lock);
- rgd = gl->gl_object;
- if (rgd)
- gfs2_free_clones(rgd);
- spin_unlock(&gl->gl_lockref.lock);
+ gfs2_free_clones(rgd);
return error;
}
@@ -216,15 +214,23 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
+ const unsigned bsize = sdp->sd_sb.sb_bsize;
+ loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
- if (rgd)
- gfs2_rgrp_brelse(rgd);
-
+ gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
- truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+ truncate_inode_pages_range(mapping, start, end);
+ rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+}
+
+static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
+ const char *fs_id_buf)
+{
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
if (rgd)
- rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ gfs2_rgrp_dump(seq, rgd, fs_id_buf);
}
static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl)
@@ -712,7 +718,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
.go_lock = gfs2_rgrp_go_lock,
- .go_dump = gfs2_rgrp_dump,
+ .go_dump = gfs2_rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ca2ec02436ec..d7707307f4b1 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -41,6 +41,10 @@ struct gfs2_log_header_host {
u32 lh_flags; /* GFS2_LOG_HEAD_... */
u32 lh_tail; /* Block number of log tail */
u32 lh_blkno;
+
+ s64 lh_local_total;
+ s64 lh_local_free;
+ s64 lh_local_dinodes;
};
/*
@@ -340,7 +344,6 @@ enum {
GLF_REPLY_PENDING = 9,
GLF_INITIAL = 10,
GLF_FROZEN = 11,
- GLF_QUEUED = 12,
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
@@ -378,17 +381,10 @@ struct gfs2_glock {
atomic_t gl_ail_count;
atomic_t gl_revokes;
struct delayed_work gl_work;
- union {
- /* For iopen glocks only */
- struct {
- struct delayed_work gl_delete;
- u64 gl_no_formal_ino;
- };
- /* For rgrp glocks only */
- struct {
- loff_t start;
- loff_t end;
- } gl_vm;
+ /* For iopen glocks only */
+ struct {
+ struct delayed_work gl_delete;
+ u64 gl_no_formal_ino;
};
struct rcu_head gl_rcu;
struct rhash_head gl_node;
@@ -701,10 +697,18 @@ struct gfs2_pcpu_lkstats {
struct gfs2_lkstats lkstats[10];
};
+/* List of local (per node) statfs inodes */
+struct local_statfs_inode {
+ struct list_head si_list;
+ struct inode *si_sc_inode;
+ unsigned int si_jid; /* journal id this statfs inode corresponds to */
+};
+
struct gfs2_sbd {
struct super_block *sd_vfs;
struct gfs2_pcpu_lkstats __percpu *sd_lkstats;
struct kobject sd_kobj;
+ struct completion sd_kobj_unregister;
unsigned long sd_flags; /* SDF_... */
struct gfs2_sb_host sd_sb;
@@ -751,6 +755,7 @@ struct gfs2_sbd {
struct inode *sd_jindex;
struct inode *sd_statfs_inode;
struct inode *sd_sc_inode;
+ struct list_head sd_sc_inodes_list;
struct inode *sd_qc_inode;
struct inode *sd_rindex;
struct inode *sd_quota_inode;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 3763c9ff1406..9133b3178677 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -70,7 +70,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct)
*
*/
-static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
bd->bd_tr = NULL;
list_del_init(&bd->bd_ail_st_list);
@@ -244,13 +244,15 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
* @tr: the transaction
* @max_revokes: If nonzero, issue revokes for the bd items for written buffers
*
+ * returns: the transaction's count of remaining active items
*/
-static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
int *max_revokes)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
+ int active_count = 0;
list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list,
bd_ail_st_list) {
@@ -265,8 +267,10 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
* If the ail buffer is not busy and caught an error, flag it
* for others.
*/
- if (!sdp->sd_log_error && buffer_busy(bh))
+ if (!sdp->sd_log_error && buffer_busy(bh)) {
+ active_count++;
continue;
+ }
if (!buffer_uptodate(bh) &&
!cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
@@ -285,6 +289,7 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
}
list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
}
+ return active_count;
}
/**
@@ -303,8 +308,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
- gfs2_ail1_empty_one(sdp, tr, &max_revokes);
- if (list_empty(&tr->tr_ail1_list) && oldest_tr)
+ if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
oldest_tr = 0;
@@ -716,16 +720,24 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp)
atomic_dec(&sdp->sd_log_blks_free);
/* If no blocks have been reserved, we need to also
* reserve a block for the header */
- if (!sdp->sd_log_blks_reserved)
+ if (!sdp->sd_log_blks_reserved) {
atomic_dec(&sdp->sd_log_blks_free);
+ trace_gfs2_log_blocks(sdp, -2);
+ } else {
+ trace_gfs2_log_blocks(sdp, -1);
+ }
}
gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
if (!sdp->sd_log_num_revoke) {
atomic_inc(&sdp->sd_log_blks_free);
- if (!sdp->sd_log_blks_reserved)
+ if (!sdp->sd_log_blks_reserved) {
atomic_inc(&sdp->sd_log_blks_free);
+ trace_gfs2_log_blocks(sdp, 2);
+ } else {
+ trace_gfs2_log_blocks(sdp, 1);
+ }
}
}
@@ -902,7 +914,7 @@ static void empty_ail1_list(struct gfs2_sbd *sdp)
}
/**
- * drain_bd - drain the buf and databuf queue for a failed transaction
+ * trans_drain - drain the buf and databuf queue for a failed transaction
* @tr: the transaction to drain
*
* When this is called, we're taking an error exit for a log write that failed
@@ -954,10 +966,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
goto out;
/* Log might have been flushed while we waited for the flush lock */
- if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
- up_write(&sdp->sd_log_flush_lock);
- return;
- }
+ if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags))
+ goto out;
trace_gfs2_log_flush(sdp, 1, flags);
if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN)
@@ -971,25 +981,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
if (unlikely (state == SFS_FROZEN))
if (gfs2_assert_withdraw_delayed(sdp,
!tr->tr_num_buf_new && !tr->tr_num_databuf_new))
- goto out;
+ goto out_withdraw;
}
if (unlikely(state == SFS_FROZEN))
if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke))
- goto out;
+ goto out_withdraw;
if (gfs2_assert_withdraw_delayed(sdp,
sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke))
- goto out;
+ goto out_withdraw;
gfs2_ordered_write(sdp);
if (gfs2_withdrawn(sdp))
- goto out;
+ goto out_withdraw;
lops_before_commit(sdp, tr);
if (gfs2_withdrawn(sdp))
- goto out;
+ goto out_withdraw;
gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
if (gfs2_withdrawn(sdp))
- goto out;
+ goto out_withdraw;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
@@ -1000,7 +1010,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
log_write_header(sdp, flags);
}
if (gfs2_withdrawn(sdp))
- goto out;
+ goto out_withdraw;
lops_after_commit(sdp, tr);
gfs2_log_lock(sdp);
@@ -1020,7 +1030,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
if (!sdp->sd_log_idle) {
empty_ail1_list(sdp);
if (gfs2_withdrawn(sdp))
- goto out;
+ goto out_withdraw;
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
@@ -1033,27 +1043,30 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
-out:
- if (gfs2_withdrawn(sdp)) {
- trans_drain(tr);
- /**
- * If the tr_list is empty, we're withdrawing during a log
- * flush that targets a transaction, but the transaction was
- * never queued onto any of the ail lists. Here we add it to
- * ail1 just so that ail_drain() will find and free it.
- */
- spin_lock(&sdp->sd_ail_lock);
- if (tr && list_empty(&tr->tr_list))
- list_add(&tr->tr_list, &sdp->sd_ail1_list);
- spin_unlock(&sdp->sd_ail_lock);
- ail_drain(sdp); /* frees all transactions */
- tr = NULL;
- }
-
+out_end:
trace_gfs2_log_flush(sdp, 0, flags);
+out:
up_write(&sdp->sd_log_flush_lock);
-
gfs2_trans_free(sdp, tr);
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
+ return;
+
+out_withdraw:
+ trans_drain(tr);
+ /**
+ * If the tr_list is empty, we're withdrawing during a log
+ * flush that targets a transaction, but the transaction was
+ * never queued onto any of the ail lists. Here we add it to
+ * ail1 just so that ail_drain() will find and free it.
+ */
+ spin_lock(&sdp->sd_ail_lock);
+ if (tr && list_empty(&tr->tr_list))
+ list_add(&tr->tr_list, &sdp->sd_ail1_list);
+ spin_unlock(&sdp->sd_ail_lock);
+ ail_drain(sdp); /* frees all transactions */
+ tr = NULL;
+ goto out_end;
}
/**
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 8965c751a303..79f97290146e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -63,7 +63,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
-
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ed1da4323967..ed69298dd824 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
*
*/
-static void gfs2_meta_sync(struct gfs2_glock *gl)
+void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9c5e4e491e03..4a3d8aecdf82 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
+extern void gfs2_meta_sync(struct gfs2_glock *gl);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 9856cc2e0795..2db573e31f78 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -348,38 +348,109 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
brelse(bh);
}
if (bd) {
- spin_lock(&sdp->sd_ail_lock);
if (bd->bd_tr) {
gfs2_trans_add_revoke(sdp, bd);
} else if (was_pinned) {
bh->b_private = NULL;
kmem_cache_free(gfs2_bufdata_cachep, bd);
+ } else if (!list_empty(&bd->bd_ail_st_list) &&
+ !list_empty(&bd->bd_ail_gl_list)) {
+ gfs2_remove_from_ail(bd);
}
- spin_unlock(&sdp->sd_ail_lock);
}
clear_buffer_dirty(bh);
clear_buffer_uptodate(bh);
}
/**
- * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
+ * gfs2_ail1_wipe - remove deleted/freed buffers from the ail1 list
+ * @sdp: superblock
+ * @bstart: starting block address of buffers to remove
+ * @blen: length of buffers to be removed
+ *
+ * This function is called from gfs2_journal wipe, whose job is to remove
+ * buffers, corresponding to deleted blocks, from the journal. If we find any
+ * bufdata elements on the system ail1 list, they haven't been written to
+ * the journal yet. So we remove them.
+ */
+static void gfs2_ail1_wipe(struct gfs2_sbd *sdp, u64 bstart, u32 blen)
+{
+ struct gfs2_trans *tr, *s;
+ struct gfs2_bufdata *bd, *bs;
+ struct buffer_head *bh;
+ u64 end = bstart + blen;
+
+ gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry_safe(tr, s, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_safe(bd, bs, &tr->tr_ail1_list,
+ bd_ail_st_list) {
+ bh = bd->bd_bh;
+ if (bh->b_blocknr < bstart || bh->b_blocknr >= end)
+ continue;
+
+ gfs2_remove_from_journal(bh, REMOVE_JDATA);
+ }
+ }
+ spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
+}
+
+static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno)
+{
+ struct address_space *mapping = ip->i_inode.i_mapping;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct page *page;
+ struct buffer_head *bh;
+ unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+ unsigned long index = blkno >> shift; /* convert block to page */
+ unsigned int bufnum = blkno - (index << shift);
+
+ page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED);
+ if (!page)
+ return NULL;
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+ }
+ /* Locate header for our buffer within our page */
+ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
+ /* Do nothing */;
+ get_bh(bh);
+ unlock_page(page);
+ put_page(page);
+ return bh;
+}
+
+/**
+ * gfs2_journal_wipe - make inode's buffers so they aren't dirty/pinned anymore
* @ip: the inode who owns the buffers
* @bstart: the first buffer in the run
* @blen: the number of buffers in the run
*
*/
-void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *bh;
+ int ty;
+ gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
+ ty = REMOVE_META;
bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
+ if (!bh && gfs2_is_jdata(ip)) {
+ bh = gfs2_getjdatabuf(ip, bstart);
+ ty = REMOVE_JDATA;
+ }
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
- gfs2_remove_from_journal(bh, REMOVE_META);
+ spin_lock(&sdp->sd_ail_lock);
+ gfs2_remove_from_journal(bh, ty);
+ spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index eafb74e861c6..4a8c01929b79 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -60,7 +60,7 @@ enum {
};
extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
-extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head **bhp);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 6d18d2c91add..7a7e3c10a9a9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_bitmap_lock);
+ INIT_LIST_HEAD(&sdp->sd_sc_inodes_list);
+
mapping = &sdp->sd_aspace;
address_space_init_once(mapping);
@@ -169,15 +171,19 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
- /* If format numbers match exactly, we're done. */
-
- if (sb->sb_fs_format == GFS2_FORMAT_FS &&
- sb->sb_multihost_format == GFS2_FORMAT_MULTI)
- return 0;
+ if (sb->sb_fs_format != GFS2_FORMAT_FS ||
+ sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+ fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
+ return -EINVAL;
+ }
- fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
+ if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE ||
+ (sb->sb_bsize & (sb->sb_bsize - 1))) {
+ pr_warn("Invalid superblock size\n");
+ return -EINVAL;
+ }
- return -EINVAL;
+ return 0;
}
static void end_bio_io_page(struct bio *bio)
@@ -604,6 +610,90 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
return error;
}
+/**
+ * init_statfs - look up and initialize master and local (per node) statfs inodes
+ * @sdp: The GFS2 superblock
+ *
+ * This should be called after the jindex is initialized in init_journal() and
+ * before gfs2_journal_recovery() is called because we need to be able to write
+ * to these inodes during recovery.
+ *
+ * Returns: errno
+ */
+static int init_statfs(struct gfs2_sbd *sdp)
+{
+ int error = 0;
+ struct inode *master = d_inode(sdp->sd_master_dir);
+ struct inode *pn = NULL;
+ char buf[30];
+ struct gfs2_jdesc *jd;
+ struct gfs2_inode *ip;
+
+ sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
+ if (IS_ERR(sdp->sd_statfs_inode)) {
+ error = PTR_ERR(sdp->sd_statfs_inode);
+ fs_err(sdp, "can't read in statfs inode: %d\n", error);
+ goto fail;
+ }
+
+ pn = gfs2_lookup_simple(master, "per_node");
+ if (IS_ERR(pn)) {
+ error = PTR_ERR(pn);
+ fs_err(sdp, "can't find per_node directory: %d\n", error);
+ goto put_statfs;
+ }
+
+ /* For each jid, lookup the corresponding local statfs inode in the
+ * per_node metafs directory and save it in the sdp->sd_sc_inodes_list. */
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ struct local_statfs_inode *lsi =
+ kmalloc(sizeof(struct local_statfs_inode), GFP_NOFS);
+ if (!lsi) {
+ error = -ENOMEM;
+ goto free_local;
+ }
+ sprintf(buf, "statfs_change%u", jd->jd_jid);
+ lsi->si_sc_inode = gfs2_lookup_simple(pn, buf);
+ if (IS_ERR(lsi->si_sc_inode)) {
+ error = PTR_ERR(lsi->si_sc_inode);
+ fs_err(sdp, "can't find local \"sc\" file#%u: %d\n",
+ jd->jd_jid, error);
+ goto free_local;
+ }
+ lsi->si_jid = jd->jd_jid;
+ if (jd->jd_jid == sdp->sd_jdesc->jd_jid)
+ sdp->sd_sc_inode = lsi->si_sc_inode;
+
+ list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list);
+ }
+
+ iput(pn);
+ ip = GFS2_I(sdp->sd_sc_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+ &sdp->sd_sc_gh);
+ if (error) {
+ fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
+ goto free_local;
+ }
+ return 0;
+
+free_local:
+ free_local_statfs_inodes(sdp);
+ iput(pn);
+put_statfs:
+ iput(sdp->sd_statfs_inode);
+fail:
+ return error;
+}
+
+/* Uninitialize and free up memory used by the list of statfs inodes */
+static void uninit_statfs(struct gfs2_sbd *sdp)
+{
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+ free_local_statfs_inodes(sdp);
+ iput(sdp->sd_statfs_inode);
+}
+
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct inode *master = d_inode(sdp->sd_master_dir);
@@ -690,6 +780,11 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
}
trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
+ /* Lookup statfs inodes here so journal recovery can use them. */
+ error = init_statfs(sdp);
+ if (error)
+ goto fail_jinode_gh;
+
if (sdp->sd_lockstruct.ls_first) {
unsigned int x;
for (x = 0; x < sdp->sd_journals; x++) {
@@ -698,14 +793,14 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (sdp->sd_args.ar_spectator) {
error = check_journal_clean(sdp, jd, true);
if (error)
- goto fail_jinode_gh;
+ goto fail_statfs;
continue;
}
error = gfs2_recover_journal(jd, true);
if (error) {
fs_err(sdp, "error recovering journal %u: %d\n",
x, error);
- goto fail_jinode_gh;
+ goto fail_statfs;
}
}
@@ -714,7 +809,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
error = gfs2_recover_journal(sdp->sd_jdesc, true);
if (error) {
fs_err(sdp, "error recovering my journal: %d\n", error);
- goto fail_jinode_gh;
+ goto fail_statfs;
}
}
@@ -725,6 +820,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func);
return 0;
+fail_statfs:
+ uninit_statfs(sdp);
fail_jinode_gh:
/* A withdraw may have done dq/uninit so now we need to check it */
if (!sdp->sd_args.ar_spectator &&
@@ -758,20 +855,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
if (error)
goto fail;
- /* Read in the master statfs inode */
- sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
- if (IS_ERR(sdp->sd_statfs_inode)) {
- error = PTR_ERR(sdp->sd_statfs_inode);
- fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail_journal;
- }
-
/* Read in the resource index inode */
sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
if (IS_ERR(sdp->sd_rindex)) {
error = PTR_ERR(sdp->sd_rindex);
fs_err(sdp, "can't get resource index inode: %d\n", error);
- goto fail_statfs;
+ goto fail_journal;
}
sdp->sd_rindex_uptodate = 0;
@@ -800,8 +889,6 @@ fail_qinode:
fail_rindex:
gfs2_clear_rgrpd(sdp);
iput(sdp->sd_rindex);
-fail_statfs:
- iput(sdp->sd_statfs_inode);
fail_journal:
init_journal(sdp, UNDO);
fail:
@@ -829,14 +916,6 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
return error;
}
- sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
- if (IS_ERR(sdp->sd_sc_inode)) {
- error = PTR_ERR(sdp->sd_sc_inode);
- fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
- goto fail;
- }
-
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
if (IS_ERR(sdp->sd_qc_inode)) {
@@ -848,33 +927,21 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
iput(pn);
pn = NULL;
- ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
- &sdp->sd_sc_gh);
- if (error) {
- fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
- goto fail_qc_i;
- }
-
ip = GFS2_I(sdp->sd_qc_inode);
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
- goto fail_ut_gh;
+ goto fail_qc_i;
}
return 0;
fail_qc_gh:
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-fail_ut_gh:
- gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
fail_qc_i:
iput(sdp->sd_qc_inode);
fail_ut_i:
- iput(sdp->sd_sc_inode);
-fail:
iput(pn);
return error;
}
@@ -1062,26 +1129,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
}
error = init_names(sdp, silent);
- if (error) {
- /* In this case, we haven't initialized sysfs, so we have to
- manually free the sdp. */
- free_sbd(sdp);
- sb->s_fs_info = NULL;
- return error;
- }
+ if (error)
+ goto fail_free;
snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
error = gfs2_sys_fs_add(sdp);
- /*
- * If we hit an error here, gfs2_sys_fs_add will have called function
- * kobject_put which causes the sysfs usage count to go to zero, which
- * causes sysfs to call function gfs2_sbd_release, which frees sdp.
- * Subsequent error paths here will call gfs2_sys_fs_del, which also
- * kobject_put to free sdp.
- */
if (error)
- return error;
+ goto fail_free;
gfs2_create_debugfs_file(sdp);
@@ -1179,9 +1234,9 @@ fail_lm:
gfs2_lm_unmount(sdp);
fail_debug:
gfs2_delete_debugfs_file(sdp);
- /* gfs2_sys_fs_del must be the last thing we do, since it causes
- * sysfs to call function gfs2_sbd_release, which frees sdp. */
gfs2_sys_fs_del(sdp);
+fail_free:
+ free_sbd(sdp);
sb->s_fs_info = NULL;
return error;
}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 390ea79d682c..b5cbe21efdfb 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
head->lh_tail = be32_to_cpu(lh->lh_tail);
head->lh_blkno = be32_to_cpu(lh->lh_blkno);
+ head->lh_local_total = be64_to_cpu(lh->lh_local_total);
+ head->lh_local_free = be64_to_cpu(lh->lh_local_free);
+ head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes);
+
return 0;
}
/**
@@ -292,6 +296,109 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
+/**
+ * update_statfs_inode - Update the master statfs inode or zero out the local
+ * statfs inode for a given journal.
+ * @jd: The journal
+ * @head: If NULL, @inode is the local statfs inode and we need to zero it out.
+ * Otherwise, it @head contains the statfs change info that needs to be
+ * synced to the master statfs inode (pointed to by @inode).
+ * @inode: statfs inode to update.
+ */
+static int update_statfs_inode(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head,
+ struct inode *inode)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct gfs2_inode *ip;
+ struct buffer_head *bh;
+ struct gfs2_statfs_change_host sc;
+ int error = 0;
+
+ BUG_ON(!inode);
+ ip = GFS2_I(inode);
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ goto out;
+
+ spin_lock(&sdp->sd_statfs_spin);
+
+ if (head) { /* Update the master statfs inode */
+ gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode));
+ sc.sc_total += head->lh_local_total;
+ sc.sc_free += head->lh_local_free;
+ sc.sc_dinodes += head->lh_local_dinodes;
+ gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode));
+
+ fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, "
+ "Free:%lld, Dinodes:%lld after change "
+ "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total,
+ sc.sc_free, sc.sc_dinodes, head->lh_local_total,
+ head->lh_local_free, head->lh_local_dinodes);
+ } else { /* Zero out the local statfs inode */
+ memset(bh->b_data + sizeof(struct gfs2_dinode), 0,
+ sizeof(struct gfs2_statfs_change));
+ /* If it's our own journal, reset any in-memory changes too */
+ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) {
+ memset(&sdp->sd_statfs_local, 0,
+ sizeof(struct gfs2_statfs_change_host));
+ }
+ }
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ mark_buffer_dirty(bh);
+ brelse(bh);
+ gfs2_meta_sync(ip->i_gl);
+
+out:
+ return error;
+}
+
+/**
+ * recover_local_statfs - Update the master and local statfs changes for this
+ * journal.
+ *
+ * Previously, statfs updates would be read in from the local statfs inode and
+ * synced to the master statfs inode during recovery.
+ *
+ * We now use the statfs updates in the journal head to update the master statfs
+ * inode instead of reading in from the local statfs inode. To preserve backward
+ * compatibility with kernels that can't do this, we still need to keep the
+ * local statfs inode up to date by writing changes to it. At some point in the
+ * future, we can do away with the local statfs inodes altogether and keep the
+ * statfs changes solely in the journal.
+ *
+ * @jd: the journal
+ * @head: the journal head
+ *
+ * Returns: errno
+ */
+static void recover_local_statfs(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head)
+{
+ int error;
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (!head->lh_local_total && !head->lh_local_free
+ && !head->lh_local_dinodes) /* No change */
+ goto zero_local;
+
+ /* First update the master statfs inode with the changes we
+ * found in the journal. */
+ error = update_statfs_inode(jd, head, sdp->sd_statfs_inode);
+ if (error)
+ goto out;
+
+zero_local:
+ /* Zero out the local statfs inode so any changes in there
+ * are not re-recovered. */
+ error = update_statfs_inode(jd, NULL,
+ find_local_statfs_inode(sdp, jd->jd_jid));
+out:
+ return;
+}
+
void gfs2_recover_func(struct work_struct *work)
{
struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
@@ -411,6 +518,7 @@ void gfs2_recover_func(struct work_struct *work)
goto fail_gunlock_thaw;
}
+ recover_local_statfs(jd, &head);
clean_journal(jd, &head);
up_read(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 074f228ea839..ee491bb9c1cc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -878,7 +878,6 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
static int read_rindex_entry(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- const unsigned bsize = sdp->sd_sb.sb_bsize;
loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
struct gfs2_rindex buf;
int error;
@@ -924,9 +923,6 @@ static int read_rindex_entry(struct gfs2_inode *ip)
spin_unlock(&sdp->sd_rindex_spin);
if (!error) {
glock_set_object(rgd->rd_gl, rgd);
- rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
- rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
- rgd->rd_length) * bsize) - 1;
return 0;
}
@@ -2209,20 +2205,17 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
/**
* gfs2_rgrp_dump - print out an rgrp
* @seq: The iterator
- * @gl: The glock in question
+ * @rgd: The rgrp in question
* @fs_id_buf: pointer to file system id (if requested)
*
*/
-void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
const char *fs_id_buf)
{
- struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_blkreserv *trs;
const struct rb_node *n;
- if (rgd == NULL)
- return;
gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
fs_id_buf,
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
@@ -2253,7 +2246,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
(unsigned long long)rgd->rd_addr);
fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
- gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
+ gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
rgd->rd_flags |= GFS2_RDF_ERROR;
}
@@ -2445,8 +2438,8 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
/* Directories keep their data in the metadata address space */
- if (meta || ip->i_depth)
- gfs2_meta_wipe(ip, bstart, blen);
+ if (meta || ip->i_depth || gfs2_is_jdata(ip))
+ gfs2_journal_wipe(ip, bstart, blen);
}
/**
@@ -2502,7 +2495,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
gfs2_statfs_change(sdp, 0, +1, -1);
trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
- gfs2_meta_wipe(ip, ip->i_no_addr, 1);
+ gfs2_journal_wipe(ip, ip->i_no_addr, 1);
}
/**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a1d7e14fc55b..9a587ada51ed 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -67,7 +67,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
+extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
const char *fs_id_buf);
extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
struct buffer_head *bh,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9f4d9e7be839..b285192bd6b3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -44,6 +44,12 @@
#include "xattr.h"
#include "lops.h"
+enum dinode_demise {
+ SHOULD_DELETE_DINODE,
+ SHOULD_NOT_DELETE_DINODE,
+ SHOULD_DEFER_EVICTION,
+};
+
/**
* gfs2_jindex_free - Clear all the journal index information
* @sdp: The GFS2 superblock
@@ -224,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
}
-static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
+void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
{
struct gfs2_statfs_change *str = buf;
@@ -702,6 +708,8 @@ restart:
if (error)
gfs2_io_error(sdp);
}
+ WARN_ON(gfs2_withdrawing(sdp));
+
/* At this point, we're through modifying the disk */
/* Release stuff */
@@ -721,7 +729,7 @@ restart:
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
- iput(sdp->sd_sc_inode);
+ free_local_statfs_inodes(sdp);
iput(sdp->sd_qc_inode);
}
@@ -736,6 +744,7 @@ restart:
/* At this point, we're through participating in the lockspace */
gfs2_sys_fs_del(sdp);
+ free_sbd(sdp);
}
/**
@@ -1309,109 +1318,98 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
}
/**
- * gfs2_evict_inode - Remove an inode from cache
+ * evict_should_delete - determine whether the inode is eligible for deletion
* @inode: The inode to evict
*
- * There are three cases to consider:
- * 1. i_nlink == 0, we are final opener (and must deallocate)
- * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
- * 3. i_nlink > 0
- *
- * If the fs is read only, then we have to treat all cases as per #3
- * since we are unable to do any deallocation. The inode will be
- * deallocated by the next read/write node to attempt an allocation
- * in the same resource group
+ * This function determines whether the evicted inode is eligible to be deleted
+ * and locks the inode glock.
*
- * We have to (at the moment) hold the inodes main lock to cover
- * the gap between unlocking the shared lock on the iopen lock and
- * taking the exclusive lock. I'd rather do a shared -> exclusive
- * conversion on the iopen lock, but we can change that later. This
- * is safe, just less efficient.
+ * Returns: the fate of the dinode
*/
-
-static void gfs2_evict_inode(struct inode *inode)
+static enum dinode_demise evict_should_delete(struct inode *inode,
+ struct gfs2_holder *gh)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct super_block *sb = inode->i_sb;
struct gfs2_sbd *sdp = sb->s_fs_info;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- struct address_space *metamapping;
- int error;
-
- if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
- clear_inode(inode);
- return;
- }
-
- if (inode->i_nlink || sb_rdonly(sb))
- goto out;
+ int ret;
if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
- gfs2_holder_mark_uninitialized(&gh);
- goto out_delete;
+ goto should_delete;
}
if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags))
- goto out;
+ return SHOULD_DEFER_EVICTION;
/* Deletes should never happen under memory pressure anymore. */
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
- goto out;
+ return SHOULD_DEFER_EVICTION;
/* Must not read inode block until block type has been verified */
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
- if (unlikely(error)) {
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh);
+ if (unlikely(ret)) {
glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
- goto out;
+ return SHOULD_DEFER_EVICTION;
}
if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
- goto out_truncate;
- error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
- if (error)
- goto out_truncate;
+ return SHOULD_NOT_DELETE_DINODE;
+ ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (ret)
+ return SHOULD_NOT_DELETE_DINODE;
if (test_bit(GIF_INVALID, &ip->i_flags)) {
- error = gfs2_inode_refresh(ip);
- if (error)
- goto out_truncate;
+ ret = gfs2_inode_refresh(ip);
+ if (ret)
+ return SHOULD_NOT_DELETE_DINODE;
}
/*
* The inode may have been recreated in the meantime.
*/
if (inode->i_nlink)
- goto out_truncate;
+ return SHOULD_NOT_DELETE_DINODE;
-out_delete:
+should_delete:
if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
if (!gfs2_upgrade_iopen_glock(inode)) {
gfs2_holder_uninit(&ip->i_iopen_gh);
- goto out_truncate;
+ return SHOULD_NOT_DELETE_DINODE;
}
}
+ return SHOULD_DELETE_DINODE;
+}
+
+/**
+ * evict_unlinked_inode - delete the pieces of an unlinked evicted inode
+ * @inode: The inode to evict
+ */
+static int evict_unlinked_inode(struct inode *inode)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int ret;
if (S_ISDIR(inode->i_mode) &&
(ip->i_diskflags & GFS2_DIF_EXHASH)) {
- error = gfs2_dir_exhash_dealloc(ip);
- if (error)
- goto out_unlock;
+ ret = gfs2_dir_exhash_dealloc(ip);
+ if (ret)
+ goto out;
}
if (ip->i_eattr) {
- error = gfs2_ea_dealloc(ip);
- if (error)
- goto out_unlock;
+ ret = gfs2_ea_dealloc(ip);
+ if (ret)
+ goto out;
}
if (!gfs2_is_stuffed(ip)) {
- error = gfs2_file_dealloc(ip);
- if (error)
- goto out_unlock;
+ ret = gfs2_file_dealloc(ip);
+ if (ret)
+ goto out;
}
/* We're about to clear the bitmap for the dinode, but as soon as we
@@ -1419,11 +1417,24 @@ out_delete:
location and try to set gl_object again. We clear gl_object here so
that subsequent inode creates don't see an old gl_object. */
glock_clear_object(ip->i_gl, ip);
- error = gfs2_dinode_dealloc(ip);
+ ret = gfs2_dinode_dealloc(ip);
gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
- goto out_unlock;
+out:
+ return ret;
+}
+
+/*
+ * evict_linked_inode - evict an inode whose dinode has not been unlinked
+ * @inode: The inode to evict
+ */
+static int evict_linked_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct address_space *metamapping;
+ int ret;
-out_truncate:
gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_EVICT_INODE);
metamapping = gfs2_glock2aspace(ip->i_gl);
@@ -1434,15 +1445,63 @@ out_truncate:
write_inode_now(inode, 1);
gfs2_ail_flush(ip->i_gl, 0);
- error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
- if (error)
- goto out_unlock;
+ ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+ if (ret)
+ return ret;
+
/* Needs to be done before glock release & also in a transaction */
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(metamapping, 0);
gfs2_trans_end(sdp);
+ return 0;
+}
+
+/**
+ * gfs2_evict_inode - Remove an inode from cache
+ * @inode: The inode to evict
+ *
+ * There are three cases to consider:
+ * 1. i_nlink == 0, we are final opener (and must deallocate)
+ * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
+ * 3. i_nlink > 0
+ *
+ * If the fs is read only, then we have to treat all cases as per #3
+ * since we are unable to do any deallocation. The inode will be
+ * deallocated by the next read/write node to attempt an allocation
+ * in the same resource group
+ *
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+
+static void gfs2_evict_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
+ clear_inode(inode);
+ return;
+ }
+
+ if (inode->i_nlink || sb_rdonly(sb))
+ goto out;
+
+ gfs2_holder_mark_uninitialized(&gh);
+ ret = evict_should_delete(inode, &gh);
+ if (ret == SHOULD_DEFER_EVICTION)
+ goto out;
+ if (ret == SHOULD_DELETE_DINODE)
+ ret = evict_unlinked_inode(inode);
+ else
+ ret = evict_linked_inode(inode);
-out_unlock:
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
@@ -1450,8 +1509,8 @@ out_unlock:
glock_clear_object(ip->i_gl, ip);
gfs2_glock_dq_uninit(&gh);
}
- if (error && error != GLR_TRYFAILED && error != -EROFS)
- fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
+ if (ret && ret != GLR_TRYFAILED && ret != -EROFS)
+ fs_warn(sdp, "gfs2_evict_inode: %d\n", ret);
out:
truncate_inode_pages_final(&inode->i_data);
if (ip->i_qadata)
@@ -1502,6 +1561,35 @@ static void gfs2_free_inode(struct inode *inode)
kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
}
+extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
+{
+ struct local_statfs_inode *lsi, *safe;
+
+ /* Run through the statfs inodes list to iput and free memory */
+ list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) {
+ if (lsi->si_jid == sdp->sd_jdesc->jd_jid)
+ sdp->sd_sc_inode = NULL; /* belongs to this node */
+ if (lsi->si_sc_inode)
+ iput(lsi->si_sc_inode);
+ list_del(&lsi->si_list);
+ kfree(lsi);
+ }
+}
+
+extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index)
+{
+ struct local_statfs_inode *lsi;
+
+ /* Return the local (per node) statfs inode in the
+ * sdp->sd_sc_inodes_list corresponding to the 'index'. */
+ list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) {
+ if (lsi->si_jid == index)
+ return lsi->si_sc_inode;
+ }
+ return NULL;
+}
+
const struct super_operations gfs2_super_ops = {
.alloc_inode = gfs2_alloc_inode,
.free_inode = gfs2_free_inode,
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 51900554ed81..c9fb2a654181 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -37,11 +37,16 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
s64 dinodes);
extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
const void *buf);
+extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
+ void *buf);
extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
extern void gfs2_freeze_func(struct work_struct *work);
+extern void free_local_statfs_inodes(struct gfs2_sbd *sdp);
+extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index);
extern void free_sbd(struct gfs2_sbd *sdp);
extern struct file_system_type gfs2_fs_type;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d28c41bd69b0..c3e72dba7418 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -303,7 +303,7 @@ static void gfs2_sbd_release(struct kobject *kobj)
{
struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
- free_sbd(sdp);
+ complete(&sdp->sd_kobj_unregister);
}
static struct kobj_type gfs2_ktype = {
@@ -655,6 +655,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
sprintf(ro, "RDONLY=%d", sb_rdonly(sb));
sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+ init_completion(&sdp->sd_kobj_unregister);
sdp->sd_kobj.kset = gfs2_kset;
error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
"%s", sdp->sd_table_name);
@@ -685,6 +686,7 @@ fail_tune:
fail_reg:
fs_err(sdp, "error %d adding sysfs files\n", error);
kobject_put(&sdp->sd_kobj);
+ wait_for_completion(&sdp->sd_kobj_unregister);
sb->s_fs_info = NULL;
return error;
}
@@ -695,6 +697,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
kobject_put(&sdp->sd_kobj);
+ wait_for_completion(&sdp->sd_kobj_unregister);
}
static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index e0025258107a..0b2f858d9a8c 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -56,7 +56,6 @@
{(1UL << GLF_REPLY_PENDING), "r" }, \
{(1UL << GLF_INITIAL), "I" }, \
{(1UL << GLF_FROZEN), "F" }, \
- {(1UL << GLF_QUEUED), "q" }, \
{(1UL << GLF_LRU), "L" }, \
{(1UL << GLF_OBJECT), "o" }, \
{(1UL << GLF_BLOCKING), "b" })
@@ -388,15 +387,17 @@ TRACE_EVENT(gfs2_log_blocks,
TP_STRUCT__entry(
__field( dev_t, dev )
__field( int, blocks )
+ __field( int, blks_free )
),
TP_fast_assign(
__entry->dev = sdp->sd_vfs->s_dev;
__entry->blocks = blocks;
+ __entry->blks_free = atomic_read(&sdp->sd_log_blks_free);
),
- TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->blocks)
+ TP_printk("%u,%u log reserve %d %d", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->blocks, __entry->blks_free)
);
/* Writing back the AIL */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 1cd0328cae20..0fba3bf64189 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -419,7 +419,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
- gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
+ gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
gfs2_lm(sdp,
"fatal: filesystem consistency error\n"
" RG = %llu\n"
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 6d9157efe16c..d7562981b3a0 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -205,6 +205,16 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
}
+/**
+ * gfs2_withdrawing - check if a withdraw is pending
+ * @sdp: the superblock
+ */
+static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp)
+{
+ return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
+ !test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+}
+
#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c33324686d89..44d07c9e3a7f 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -104,8 +104,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = HFS_SB(sb)->fs_ablocks;
buf->f_ffree = HFS_SB(sb)->free_ablocks;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = HFS_NAMELEN;
return 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 129dca3f4b78..807119ae5adf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -320,8 +320,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = 0xFFFFFFFF;
buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = HFSPLUS_MAX_STRLEN;
return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0a677a9aaf34..a7dbfc892022 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -192,8 +192,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = sbi->sb_n_free;
buf->f_files = sbi->sb_dirband_size / 4;
buf->f_ffree = hpfs_get_free_dnodes(s);
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = 254;
hpfs_unlock(s);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0a182f1333e8..02894df7656d 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -18,7 +18,10 @@
#include <linux/fs_struct.h>
#include <linux/task_work.h>
#include <linux/blk-cgroup.h>
+#include <linux/audit.h>
+#include <linux/cpu.h>
+#include "../kernel/sched/sched.h"
#include "io-wq.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
@@ -122,9 +125,13 @@ struct io_wq {
refcount_t refs;
struct completion done;
+ struct hlist_node cpuhp_node;
+
refcount_t use_refs;
};
+static enum cpuhp_state io_wq_online;
+
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@@ -186,7 +193,8 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->blkcg_css = NULL;
}
#endif
-
+ if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
return dropped_lock;
}
@@ -429,14 +437,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
mmput(worker->mm);
worker->mm = NULL;
}
- if (!work->mm)
- return;
- if (mmget_not_zero(work->mm)) {
- kthread_use_mm(work->mm);
- worker->mm = work->mm;
- /* hang on to this mm */
- work->mm = NULL;
+ if (mmget_not_zero(work->identity->mm)) {
+ kthread_use_mm(work->identity->mm);
+ worker->mm = work->identity->mm;
return;
}
@@ -448,9 +452,11 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker,
struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
- if (work->blkcg_css != worker->blkcg_css) {
- kthread_associate_blkcg(work->blkcg_css);
- worker->blkcg_css = work->blkcg_css;
+ if (!(work->flags & IO_WQ_WORK_BLKCG))
+ return;
+ if (work->identity->blkcg_css != worker->blkcg_css) {
+ kthread_associate_blkcg(work->identity->blkcg_css);
+ worker->blkcg_css = work->identity->blkcg_css;
}
#endif
}
@@ -458,9 +464,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker,
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
- const struct cred *old_creds = override_creds(work->creds);
+ const struct cred *old_creds = override_creds(work->identity->creds);
- worker->cur_creds = work->creds;
+ worker->cur_creds = work->identity->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
@@ -470,20 +476,29 @@ static void io_wq_switch_creds(struct io_worker *worker,
static void io_impersonate_work(struct io_worker *worker,
struct io_wq_work *work)
{
- if (work->files && current->files != work->files) {
+ if ((work->flags & IO_WQ_WORK_FILES) &&
+ current->files != work->identity->files) {
task_lock(current);
- current->files = work->files;
- current->nsproxy = work->nsproxy;
+ current->files = work->identity->files;
+ current->nsproxy = work->identity->nsproxy;
task_unlock(current);
}
- if (work->fs && current->fs != work->fs)
- current->fs = work->fs;
- if (work->mm != worker->mm)
+ if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
+ current->fs = work->identity->fs;
+ if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
io_wq_switch_mm(worker, work);
- if (worker->cur_creds != work->creds)
+ if ((work->flags & IO_WQ_WORK_CREDS) &&
+ worker->cur_creds != work->identity->creds)
io_wq_switch_creds(worker, work);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
+ if (work->flags & IO_WQ_WORK_FSIZE)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
+ else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
io_wq_switch_blkcg(worker, work);
+#ifdef CONFIG_AUDIT
+ current->loginuid = work->identity->loginuid;
+ current->sessionid = work->identity->sessionid;
+#endif
}
static void io_assign_current_work(struct io_worker *worker,
@@ -496,6 +511,11 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
+#ifdef CONFIG_AUDIT
+ current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
+ current->sessionid = AUDIT_SID_UNSET;
+#endif
+
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
@@ -676,6 +696,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
kfree(worker);
return false;
}
+ kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -1076,10 +1097,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
- if (!wq->wqes) {
- kfree(wq);
- return ERR_PTR(-ENOMEM);
- }
+ if (!wq->wqes)
+ goto err_wq;
+
+ ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
+ if (ret)
+ goto err_wqes;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
@@ -1087,6 +1110,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
+ ret = -ENOMEM;
for_each_node(node) {
struct io_wqe *wqe;
int alloc_node = node;
@@ -1130,9 +1154,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = PTR_ERR(wq->manager);
complete(&wq->done);
err:
+ cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
kfree(wq->wqes[node]);
+err_wqes:
kfree(wq->wqes);
+err_wq:
kfree(wq);
return ERR_PTR(ret);
}
@@ -1149,6 +1176,8 @@ static void __io_wq_destroy(struct io_wq *wq)
{
int node;
+ cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
+
set_bit(IO_WQ_BIT_EXIT, &wq->state);
if (wq->manager)
kthread_stop(wq->manager);
@@ -1176,3 +1205,41 @@ struct task_struct *io_wq_get_task(struct io_wq *wq)
{
return wq->manager;
}
+
+static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
+{
+ struct task_struct *task = worker->task;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+ do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
+ task->flags |= PF_NO_SETAFFINITY;
+ task_rq_unlock(rq, task, &rf);
+ return false;
+}
+
+static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
+ int i;
+
+ rcu_read_lock();
+ for_each_node(i)
+ io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
+ rcu_read_unlock();
+ return 0;
+}
+
+static __init int io_wq_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
+ io_wq_cpu_online, NULL);
+ if (ret < 0)
+ return ret;
+ io_wq_online = ret;
+ return 0;
+}
+subsys_initcall(io_wq_init);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 84bcf6a85523..cba36f03c355 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -1,6 +1,8 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
+#include <linux/io_uring.h>
+
struct io_wq;
enum {
@@ -10,6 +12,13 @@ enum {
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
+ IO_WQ_WORK_FILES = 32,
+ IO_WQ_WORK_FS = 64,
+ IO_WQ_WORK_MM = 128,
+ IO_WQ_WORK_CREDS = 256,
+ IO_WQ_WORK_BLKCG = 512,
+ IO_WQ_WORK_FSIZE = 1024,
+
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -85,15 +94,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- struct files_struct *files;
- struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *blkcg_css;
-#endif
- const struct cred *creds;
- struct nsproxy *nsproxy;
- struct fs_struct *fs;
- unsigned long fsize;
+ struct io_identity *identity;
unsigned flags;
};
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b58169240c77..b42dfa0243bf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -81,6 +81,7 @@
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/blk-cgroup.h>
+#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -276,7 +277,7 @@ struct io_ring_ctx {
unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
- atomic_t cached_cq_overflow;
+ unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
struct list_head defer_list;
@@ -327,6 +328,11 @@ struct io_ring_ctx {
const struct cred *creds;
+#ifdef CONFIG_AUDIT
+ kuid_t loginuid;
+ unsigned int sessionid;
+#endif
+
struct completion ref_comp;
struct completion sq_thread_comp;
@@ -574,12 +580,12 @@ enum {
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_LTIMEOUT_ACTIVE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -609,12 +615,10 @@ enum {
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* has linked timeout */
+ /* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* completion under lock */
- REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
@@ -625,6 +629,8 @@ enum {
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* linked timeout is active, i.e. prepared by link's head */
+ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
};
struct async_poll {
@@ -732,8 +738,6 @@ struct io_submit_state {
};
struct io_op_def {
- /* needs current->mm setup, does mm access */
- unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
/* don't fail if file grab fails */
@@ -744,67 +748,58 @@ struct io_op_def {
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
- /* needs file table */
- unsigned file_table : 1;
- /* needs ->fs */
- unsigned needs_fs : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
- /* needs rlimit(RLIMIT_FSIZE) assigned */
- unsigned needs_fsize : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
- /* needs blkcg context, issues async io potentially */
- unsigned needs_blkcg : 1;
/* size of async data needed, if any */
unsigned short async_size;
+ unsigned work_flags;
};
-static const struct io_op_def io_op_defs[] __read_mostly = {
+static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
- .needs_mm = 1,
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
.needs_async_data = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
+ IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -813,137 +808,122 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .needs_fs = 1,
.pollout = 1,
.needs_async_data = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_msghdr),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .needs_fs = 1,
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_msghdr),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
- .needs_mm = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
- .file_table = 1,
.pollin = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
- .needs_mm = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
+ .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
- .needs_fsize = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
},
[IORING_OP_OPENAT] = {
- .file_table = 1,
- .needs_fs = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.needs_file_no_error = 1,
- .file_table = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FILES_UPDATE] = {
- .needs_mm = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
},
[IORING_OP_STATX] = {
- .needs_mm = 1,
- .needs_fs = 1,
- .file_table = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
- .needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_MADVISE] = {
- .needs_mm = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_SEND] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
- .needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
- .file_table = 1,
- .needs_fs = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
+ IO_WQ_WORK_BLKCG,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
- .file_table = 1,
+ .work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
- .needs_blkcg = 1,
+ .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
@@ -963,8 +943,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
-static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -986,7 +966,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static struct kmem_cache *req_cachep;
-static const struct file_operations io_uring_fops __read_mostly;
+static const struct file_operations io_uring_fops;
struct sock *io_uring_get_socket(struct file *file)
{
@@ -1034,7 +1014,7 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_mm)
+ if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
return 0;
return __io_sq_thread_acquire_mm(ctx);
}
@@ -1066,16 +1046,53 @@ static inline void req_set_fail_links(struct io_kiocb *req)
}
/*
+ * None of these are dereferenced, they are simply used to check if any of
+ * them have changed. If we're under current and check they are still the
+ * same, we're fine to grab references to them for actual out-of-line use.
+ */
+static void io_init_identity(struct io_identity *id)
+{
+ id->files = current->files;
+ id->mm = current->mm;
+#ifdef CONFIG_BLK_CGROUP
+ rcu_read_lock();
+ id->blkcg_css = blkcg_css();
+ rcu_read_unlock();
+#endif
+ id->creds = current_cred();
+ id->nsproxy = current->nsproxy;
+ id->fs = current->fs;
+ id->fsize = rlimit(RLIMIT_FSIZE);
+#ifdef CONFIG_AUDIT
+ id->loginuid = current->loginuid;
+ id->sessionid = current->sessionid;
+#endif
+ refcount_set(&id->count, 1);
+}
+
+static inline void __io_req_init_async(struct io_kiocb *req)
+{
+ memset(&req->work, 0, sizeof(req->work));
+ req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
+/*
* Note: must call io_req_init_async() for the first time you
* touch any members of io_wq_work.
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = current->io_uring;
+
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
- memset(&req->work, 0, sizeof(req->work));
- req->flags |= REQ_F_WORK_INITIALIZED;
+ __io_req_init_async(req);
+
+ /* Grab a ref if this isn't our static identity */
+ req->work.identity = tctx->identity;
+ if (tctx->identity != &tctx->__identity)
+ refcount_inc(&req->work.identity->count);
}
static inline bool io_async_submit(struct io_ring_ctx *ctx)
@@ -1162,7 +1179,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
struct io_ring_ctx *ctx = req->ctx;
return seq != ctx->cached_cq_tail
- + atomic_read(&ctx->cached_cq_overflow);
+ + READ_ONCE(ctx->cached_cq_overflow);
}
return false;
@@ -1181,105 +1198,198 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-/*
- * Returns true if we need to defer file table putting. This can only happen
- * from the error path with REQ_F_COMP_LOCKED set.
- */
-static bool io_req_clean_work(struct io_kiocb *req)
+static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
+{
+ if (req->work.identity == &tctx->__identity)
+ return;
+ if (refcount_dec_and_test(&req->work.identity->count))
+ kfree(req->work.identity);
+}
+
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
- return false;
+ return;
req->flags &= ~REQ_F_WORK_INITIALIZED;
- if (req->work.mm) {
- mmdrop(req->work.mm);
- req->work.mm = NULL;
+ if (req->work.flags & IO_WQ_WORK_MM) {
+ mmdrop(req->work.identity->mm);
+ req->work.flags &= ~IO_WQ_WORK_MM;
}
#ifdef CONFIG_BLK_CGROUP
- if (req->work.blkcg_css)
- css_put(req->work.blkcg_css);
+ if (req->work.flags & IO_WQ_WORK_BLKCG) {
+ css_put(req->work.identity->blkcg_css);
+ req->work.flags &= ~IO_WQ_WORK_BLKCG;
+ }
#endif
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
+ if (req->work.flags & IO_WQ_WORK_CREDS) {
+ put_cred(req->work.identity->creds);
+ req->work.flags &= ~IO_WQ_WORK_CREDS;
}
- if (req->work.fs) {
- struct fs_struct *fs = req->work.fs;
-
- if (req->flags & REQ_F_COMP_LOCKED)
- return true;
+ if (req->work.flags & IO_WQ_WORK_FS) {
+ struct fs_struct *fs = req->work.identity->fs;
- spin_lock(&req->work.fs->lock);
+ spin_lock(&req->work.identity->fs->lock);
if (--fs->users)
fs = NULL;
- spin_unlock(&req->work.fs->lock);
+ spin_unlock(&req->work.identity->fs->lock);
if (fs)
free_fs_struct(fs);
- req->work.fs = NULL;
+ req->work.flags &= ~IO_WQ_WORK_FS;
}
- return false;
+ io_put_identity(req->task->io_uring, req);
}
-static void io_prep_async_work(struct io_kiocb *req)
+/*
+ * Create a private copy of io_identity, since some fields don't match
+ * the current context.
+ */
+static bool io_identity_cow(struct io_kiocb *req)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ const struct cred *creds = NULL;
+ struct io_identity *id;
+
+ if (req->work.flags & IO_WQ_WORK_CREDS)
+ creds = req->work.identity->creds;
+
+ id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
+ if (unlikely(!id)) {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ return false;
+ }
+
+ /*
+ * We can safely just re-init the creds we copied Either the field
+ * matches the current one, or we haven't grabbed it yet. The only
+ * exception is ->creds, through registered personalities, so handle
+ * that one separately.
+ */
+ io_init_identity(id);
+ if (creds)
+ req->work.identity->creds = creds;
+
+ /* add one for this request */
+ refcount_inc(&id->count);
+
+ /* drop old identity, assign new one. one ref for req, one for tctx */
+ if (req->work.identity != tctx->identity &&
+ refcount_sub_and_test(2, &req->work.identity->count))
+ kfree(req->work.identity);
+
+ req->work.identity = id;
+ tctx->identity = id;
+ return true;
+}
+
+static bool io_grab_identity(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_identity *id = req->work.identity;
struct io_ring_ctx *ctx = req->ctx;
- io_req_init_async(req);
-
- if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
- io_wq_hash_work(&req->work, file_inode(req->file));
- } else {
- if (def->unbound_nonreg_file)
- req->work.flags |= IO_WQ_WORK_UNBOUND;
+ if (def->work_flags & IO_WQ_WORK_FSIZE) {
+ if (id->fsize != rlimit(RLIMIT_FSIZE))
+ return false;
+ req->work.flags |= IO_WQ_WORK_FSIZE;
}
- if (!req->work.files && io_op_defs[req->opcode].file_table &&
+
+ if (!(req->work.flags & IO_WQ_WORK_FILES) &&
+ (def->work_flags & IO_WQ_WORK_FILES) &&
!(req->flags & REQ_F_NO_FILE_TABLE)) {
- req->work.files = get_files_struct(current);
- get_nsproxy(current->nsproxy);
- req->work.nsproxy = current->nsproxy;
+ if (id->files != current->files ||
+ id->nsproxy != current->nsproxy)
+ return false;
+ atomic_inc(&id->files->count);
+ get_nsproxy(id->nsproxy);
req->flags |= REQ_F_INFLIGHT;
spin_lock_irq(&ctx->inflight_lock);
list_add(&req->inflight_entry, &ctx->inflight_list);
spin_unlock_irq(&ctx->inflight_lock);
- }
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
+ req->work.flags |= IO_WQ_WORK_FILES;
}
#ifdef CONFIG_BLK_CGROUP
- if (!req->work.blkcg_css && def->needs_blkcg) {
+ if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
+ (def->work_flags & IO_WQ_WORK_BLKCG)) {
rcu_read_lock();
- req->work.blkcg_css = blkcg_css();
+ if (id->blkcg_css != blkcg_css()) {
+ rcu_read_unlock();
+ return false;
+ }
/*
* This should be rare, either the cgroup is dying or the task
* is moving cgroups. Just punt to root for the handful of ios.
*/
- if (!css_tryget_online(req->work.blkcg_css))
- req->work.blkcg_css = NULL;
+ if (css_tryget_online(id->blkcg_css))
+ req->work.flags |= IO_WQ_WORK_BLKCG;
rcu_read_unlock();
}
#endif
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
+ if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
+ if (id->creds != current_cred())
+ return false;
+ get_cred(id->creds);
+ req->work.flags |= IO_WQ_WORK_CREDS;
+ }
+#ifdef CONFIG_AUDIT
+ if (!uid_eq(current->loginuid, id->loginuid) ||
+ current->sessionid != id->sessionid)
+ return false;
+#endif
+ if (!(req->work.flags & IO_WQ_WORK_FS) &&
+ (def->work_flags & IO_WQ_WORK_FS)) {
+ if (current->fs != id->fs)
+ return false;
+ spin_lock(&id->fs->lock);
+ if (!id->fs->in_exec) {
+ id->fs->users++;
+ req->work.flags |= IO_WQ_WORK_FS;
} else {
req->work.flags |= IO_WQ_WORK_CANCEL;
}
spin_unlock(&current->fs->lock);
}
- if (def->needs_fsize)
- req->work.fsize = rlimit(RLIMIT_FSIZE);
- else
- req->work.fsize = RLIM_INFINITY;
+
+ return true;
+}
+
+static void io_prep_async_work(struct io_kiocb *req)
+{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_identity *id;
+
+ io_req_init_async(req);
+ id = req->work.identity;
+
+ if (req->flags & REQ_F_ISREG) {
+ if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
+ io_wq_hash_work(&req->work, file_inode(req->file));
+ } else {
+ if (def->unbound_nonreg_file)
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+ }
+
+ /* ->mm can never change on us */
+ if (!(req->work.flags & IO_WQ_WORK_MM) &&
+ (def->work_flags & IO_WQ_WORK_MM)) {
+ mmgrab(id->mm);
+ req->work.flags |= IO_WQ_WORK_MM;
+ }
+
+ /* if we fail grabbing identity, we must COW, regrab, and retry */
+ if (io_grab_identity(req))
+ return;
+
+ if (!io_identity_cow(req))
+ return;
+
+ /* can't fail at this point */
+ if (!io_grab_identity(req))
+ WARN_ON(1);
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1325,9 +1435,8 @@ static void io_kill_timeout(struct io_kiocb *req)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
}
@@ -1378,8 +1487,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
if (link) {
__io_queue_linked_timeout(link);
/* drop submission reference */
- link->flags |= REQ_F_COMP_LOCKED;
- io_put_req(link);
+ io_put_req_deferred(link, 1);
}
kfree(de);
} while (!list_empty(&ctx->defer_list));
@@ -1471,8 +1579,9 @@ static inline bool io_match_files(struct io_kiocb *req,
{
if (!files)
return true;
- if (req->flags & REQ_F_WORK_INITIALIZED)
- return req->work.files == files;
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES))
+ return req->work.identity->files == files;
return false;
}
@@ -1518,8 +1627,9 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
+ ctx->cached_cq_overflow++;
WRITE_ONCE(ctx->rings->cq_overflow,
- atomic_inc_return(&ctx->cached_cq_overflow));
+ ctx->cached_cq_overflow);
}
}
@@ -1561,8 +1671,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
* then we cannot store the request for later flushing, we need
* to drop it on the floor.
*/
- WRITE_ONCE(ctx->rings->cq_overflow,
- atomic_inc_return(&ctx->cached_cq_overflow));
+ ctx->cached_cq_overflow++;
+ WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
} else {
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
@@ -1606,13 +1716,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
list_del(&req->compl.list);
__io_cqring_fill_event(req, req->result, req->compl.cflags);
- if (!(req->flags & REQ_F_LINK_HEAD)) {
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- } else {
+
+ /*
+ * io_free_req() doesn't care about completion_lock unless one
+ * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
+ * because of a potential deadlock with req->work.fs->lock
+ */
+ if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
+ |REQ_F_WORK_INITIALIZED)) {
spin_unlock_irq(&ctx->completion_lock);
io_put_req(req);
spin_lock_irq(&ctx->completion_lock);
+ } else {
+ io_put_req(req);
}
}
io_commit_cqring(ctx);
@@ -1699,7 +1815,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static bool io_dismantle_req(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
io_clean_op(req);
@@ -1708,15 +1824,17 @@ static bool io_dismantle_req(struct io_kiocb *req)
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- return io_req_clean_work(req);
+ io_req_clean_work(req);
}
-static void __io_free_req_finish(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- atomic_long_inc(&tctx->req_complete);
+ io_dismantle_req(req);
+
+ percpu_counter_dec(&tctx->inflight);
if (tctx->in_idle)
wake_up(&tctx->wait);
put_task_struct(req->task);
@@ -1728,39 +1846,6 @@ static void __io_free_req_finish(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
-static void io_req_task_file_table_put(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct fs_struct *fs = req->work.fs;
-
- spin_lock(&req->work.fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->work.fs->lock);
- if (fs)
- free_fs_struct(fs);
- req->work.fs = NULL;
- __io_free_req_finish(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- if (!io_dismantle_req(req)) {
- __io_free_req_finish(req);
- } else {
- int ret;
-
- init_task_work(&req->task_work, io_req_task_file_table_put);
- ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
- if (unlikely(ret)) {
- struct task_struct *tsk;
-
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
- }
- }
-}
-
static bool io_link_cancel_timeout(struct io_kiocb *req)
{
struct io_timeout_data *io = req->async_data;
@@ -1772,7 +1857,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return true;
}
@@ -1789,9 +1874,14 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
if (link->opcode != IORING_OP_LINK_TIMEOUT)
return false;
+ /*
+ * Can happen if a linked timeout fired and link had been like
+ * req -> link t-out -> link t-out [-> ...]
+ */
+ if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE))
+ return false;
list_del_init(&link->link_list);
- link->flags |= REQ_F_COMP_LOCKED;
wake_ev = io_link_cancel_timeout(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
return wake_ev;
@@ -1800,17 +1890,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
bool wake_ev;
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- wake_ev = __io_kill_linked_timeout(req);
- }
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (wake_ev)
io_cqring_ev_posted(ctx);
@@ -1838,10 +1923,12 @@ static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void __io_fail_links(struct io_kiocb *req)
+static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
struct io_kiocb, link_list);
@@ -1850,28 +1937,20 @@ static void __io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
- link->flags |= REQ_F_COMP_LOCKED;
- __io_double_put_req(link);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
+
+ /*
+ * It's ok to free under spinlock as they're not linked anymore,
+ * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
+ * work.fs->lock.
+ */
+ if (link->flags & REQ_F_WORK_INITIALIZED)
+ io_put_req_deferred(link, 2);
+ else
+ io_double_put_req(link);
}
io_commit_cqring(ctx);
- io_cqring_ev_posted(ctx);
-}
-
-static void io_fail_links(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- __io_fail_links(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- __io_fail_links(req);
- }
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
@@ -1905,7 +1984,8 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
- int ret, notify;
+ enum task_work_notify_mode notify;
+ int ret;
if (tsk->flags & PF_EXITING)
return -ESRCH;
@@ -1916,7 +1996,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
* processing task_work. There's no reliable way to tell if TWA_RESUME
* will do the job.
*/
- notify = 0;
+ notify = TWA_NONE;
if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
notify = TWA_SIGNAL;
@@ -1985,7 +2065,7 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_cancel);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, TWA_NONE);
wake_up_process(tsk);
}
}
@@ -2033,7 +2113,9 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
if (rb->to_free)
__io_req_free_batch_flush(ctx, rb);
if (rb->task) {
- atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
+ struct io_uring_task *tctx = rb->task->io_uring;
+
+ percpu_counter_sub(&tctx->inflight, rb->task_refs);
put_task_struct_many(rb->task, rb->task_refs);
rb->task = NULL;
}
@@ -2050,7 +2132,9 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
if (req->task != rb->task) {
if (rb->task) {
- atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
+ struct io_uring_task *tctx = rb->task->io_uring;
+
+ percpu_counter_sub(&tctx->inflight, rb->task_refs);
put_task_struct_many(rb->task, rb->task_refs);
}
rb->task = req->task;
@@ -2058,7 +2142,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
}
rb->task_refs++;
- WARN_ON_ONCE(io_dismantle_req(req));
+ io_dismantle_req(req);
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
__io_req_free_batch_flush(req->ctx, rb);
@@ -2085,6 +2169,34 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
+static void io_put_req_deferred_cb(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ io_free_req(req);
+}
+
+static void io_free_req_deferred(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_put_req_deferred_cb);
+ ret = io_req_task_work_add(req, true);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, TWA_NONE);
+ wake_up_process(tsk);
+ }
+}
+
+static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
+{
+ if (refcount_sub_and_test(refs, &req->refs))
+ io_free_req_deferred(req);
+}
+
static struct io_wq_work *io_steal_work(struct io_kiocb *req)
{
struct io_kiocb *nxt;
@@ -2101,17 +2213,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
return nxt ? &nxt->work : NULL;
}
-/*
- * Must only be used if we don't need to care about links, usually from
- * within the completion handling itself.
- */
-static void __io_double_put_req(struct io_kiocb *req)
-{
- /* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
- __io_free_req(req);
-}
-
static void io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
@@ -2601,7 +2702,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
static bool io_bdev_nowait(struct block_device *bdev)
{
#ifdef CONFIG_BLOCK
- return !bdev || queue_is_mq(bdev_get_queue(bdev));
+ return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
#else
return true;
#endif
@@ -3016,9 +3117,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
* For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually.
*/
-static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
- struct iov_iter *iter)
+static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct file *file = req->file;
ssize_t ret = 0;
/*
@@ -3038,11 +3140,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
if (!iov_iter_is_bvec(iter)) {
iovec = iov_iter_iovec(iter);
} else {
- /* fixed buffers import bvec */
- iovec.iov_base = kmap(iter->bvec->bv_page)
- + iter->iov_offset;
- iovec.iov_len = min(iter->count,
- iter->bvec->bv_len - iter->iov_offset);
+ iovec.iov_base = u64_to_user_ptr(req->rw.addr);
+ iovec.iov_len = req->rw.len;
}
if (rw == READ) {
@@ -3053,9 +3152,6 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, io_kiocb_ppos(kiocb));
}
- if (iov_iter_is_bvec(iter))
- kunmap(iter->bvec->bv_page);
-
if (nr < 0) {
if (!ret)
ret = nr;
@@ -3064,6 +3160,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
ret += nr;
if (nr != iovec.iov_len)
break;
+ req->rw.len -= nr;
+ req->rw.addr += nr;
iov_iter_advance(iter, nr);
}
@@ -3199,7 +3297,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
/* queue just for cancelation */
init_task_work(&req->task_work, io_req_task_cancel);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, TWA_NONE);
wake_up_process(tsk);
}
return 1;
@@ -3253,7 +3351,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
- return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ return loop_rw_iter(READ, req, iter);
else
return -EINVAL;
}
@@ -3444,7 +3542,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
+ ret2 = loop_rw_iter(WRITE, req, iter);
else
ret2 = -EINVAL;
@@ -4123,7 +4221,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
}
/* No ->flush() or already async, safely close from here */
- ret = filp_close(close->put_file, req->work.files);
+ ret = filp_close(close->put_file, req->work.identity->files);
if (ret < 0)
req_set_fail_links(req);
fput(close->put_file);
@@ -4765,7 +4863,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
+ task_work_add(tsk, &req->task_work, TWA_NONE);
wake_up_process(tsk);
}
return 1;
@@ -4834,33 +4932,25 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
io_commit_cqring(ctx);
}
-static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_poll_task_func(struct callback_head *cb)
{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *nxt;
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
- return;
- }
-
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- *nxt = io_put_req_find_next(req);
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
-}
+ } else {
+ hash_del(&req->hash_node);
+ io_poll_complete(req, req->result, 0);
+ spin_unlock_irq(&ctx->completion_lock);
-static void io_poll_task_func(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt = NULL;
+ nxt = io_put_req_find_next(req);
+ io_cqring_ev_posted(ctx);
+ if (nxt)
+ __io_req_task_submit(nxt);
+ }
- io_poll_task_handler(req, &nxt);
- if (nxt)
- __io_req_task_submit(nxt);
percpu_ref_put(&ctx->refs);
}
@@ -4917,6 +5007,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
* for write). Setup a separate io_poll_iocb if this happens.
*/
if (unlikely(poll->head)) {
+ struct io_poll_iocb *poll_one = poll;
+
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
pt->error = -EINVAL;
@@ -4927,7 +5019,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -ENOMEM;
return;
}
- io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
+ io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
refcount_inc(&req->refs);
poll->wait.private = req;
*poll_ptr = poll;
@@ -5012,6 +5104,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
+ INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, wake_func);
poll->file = req->file;
poll->wait.private = req;
@@ -5073,7 +5166,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
req->apoll = apoll;
- INIT_HLIST_NODE(&req->hash_node);
mask = 0;
if (def->pollin)
@@ -5144,9 +5236,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
- req->flags |= REQ_F_COMP_LOCKED;
req_set_fail_links(req);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
return do_complete;
@@ -5256,8 +5347,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
return -EINVAL;
- if (!poll->file)
- return -EBADF;
events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
@@ -5275,7 +5364,6 @@ static int io_poll_add(struct io_kiocb *req)
struct io_poll_table ipt;
__poll_t mask;
- INIT_HLIST_NODE(&req->hash_node);
ipt.pt._qproc = io_poll_queue_proc;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
@@ -5328,9 +5416,8 @@ static int __io_timeout_cancel(struct io_kiocb *req)
list_del_init(&req->timeout.list);
req_set_fail_links(req);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return 0;
}
@@ -5740,9 +5827,9 @@ static void io_req_drop_files(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
req->flags &= ~REQ_F_INFLIGHT;
- put_files_struct(req->work.files);
- put_nsproxy(req->work.nsproxy);
- req->work.files = NULL;
+ put_files_struct(req->work.identity->files);
+ put_nsproxy(req->work.identity->nsproxy);
+ req->work.flags &= ~IO_WQ_WORK_FILES;
}
static void __io_clean_op(struct io_kiocb *req)
@@ -6026,10 +6113,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (!list_empty(&req->link_list)) {
prev = list_entry(req->link_list.prev, struct io_kiocb,
link_list);
- if (refcount_inc_not_zero(&prev->refs)) {
+ if (refcount_inc_not_zero(&prev->refs))
list_del_init(&req->link_list);
- prev->flags &= ~REQ_F_LINK_TIMEOUT;
- } else
+ else
prev = NULL;
}
@@ -6086,6 +6172,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
}
@@ -6100,14 +6187,15 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
again:
linked_timeout = io_prep_linked_timeout(req);
- if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
- req->work.creds != current_cred()) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_CREDS) &&
+ req->work.identity->creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
- if (old_creds == req->work.creds)
+ if (old_creds == req->work.identity->creds)
old_creds = NULL; /* restored original creds */
else
- old_creds = override_creds(req->work.creds);
+ old_creds = override_creds(req->work.identity->creds);
}
ret = io_issue_sqe(req, true, cs);
@@ -6148,8 +6236,10 @@ punt:
if (nxt) {
req = nxt;
- if (req->flags & REQ_F_FORCE_ASYNC)
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ linked_timeout = NULL;
goto punt;
+ }
goto again;
}
exit:
@@ -6410,11 +6500,17 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
id = READ_ONCE(sqe->personality);
if (id) {
- io_req_init_async(req);
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds))
+ struct io_identity *iod;
+
+ iod = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!iod))
return -EINVAL;
- get_cred(req->work.creds);
+ refcount_inc(&iod->count);
+
+ __io_req_init_async(req);
+ get_cred(iod->creds);
+ req->work.identity = iod;
+ req->work.flags |= IO_WQ_WORK_CREDS;
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -6447,7 +6543,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- atomic_long_add(nr, &current->io_uring->req_issue);
+ percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
io_submit_state_start(&state, ctx, nr);
@@ -6489,10 +6585,12 @@ fail_req:
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
+ struct io_uring_task *tctx = current->io_uring;
+ int unused = nr - ref_used;
- percpu_ref_put_many(&ctx->refs, nr - ref_used);
- atomic_long_sub(nr - ref_used, &current->io_uring->req_issue);
- put_task_struct_many(current, nr - ref_used);
+ percpu_ref_put_many(&ctx->refs, unused);
+ percpu_counter_sub(&tctx->inflight, unused);
+ put_task_struct_many(current, unused);
}
if (link)
io_queue_link_head(link, &state.comp);
@@ -6672,6 +6770,10 @@ static int io_sq_thread(void *data)
old_cred = override_creds(ctx->creds);
}
io_sq_thread_associate_blkcg(ctx, &cur_css);
+#ifdef CONFIG_AUDIT
+ current->loginuid = ctx->loginuid;
+ current->sessionid = ctx->sessionid;
+#endif
ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
@@ -7306,7 +7408,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
spin_lock_init(&file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- file_data->table = kcalloc(nr_tables, sizeof(file_data->table),
+ file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
GFP_KERNEL);
if (!file_data->table)
goto out_free;
@@ -7317,6 +7419,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
goto out_ref;
+ ctx->file_data = file_data;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
struct fixed_file_table *table;
@@ -7351,7 +7454,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
table->files[index] = file;
}
- ctx->file_data = file_data;
ret = io_sqe_files_scm(ctx);
if (ret) {
io_sqe_files_unregister(ctx);
@@ -7384,6 +7486,7 @@ out_ref:
out_free:
kfree(file_data->table);
kfree(file_data);
+ ctx->file_data = NULL;
return ret;
}
@@ -7609,17 +7712,24 @@ out_fput:
static int io_uring_alloc_task_context(struct task_struct *task)
{
struct io_uring_task *tctx;
+ int ret;
tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
return -ENOMEM;
+ ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
+ if (unlikely(ret)) {
+ kfree(tctx);
+ return ret;
+ }
+
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
tctx->in_idle = 0;
- atomic_long_set(&tctx->req_issue, 0);
- atomic_long_set(&tctx->req_complete, 0);
+ io_init_identity(&tctx->__identity);
+ tctx->identity = &tctx->__identity;
task->io_uring = tctx;
return 0;
}
@@ -7629,6 +7739,10 @@ void __io_uring_free(struct task_struct *tsk)
struct io_uring_task *tctx = tsk->io_uring;
WARN_ON_ONCE(!xa_empty(&tctx->xa));
+ WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
+ if (tctx->identity != &tctx->__identity)
+ kfree(tctx->identity);
+ percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
@@ -8205,11 +8319,14 @@ static int io_uring_fasync(int fd, struct file *file, int on)
static int io_remove_personalities(int id, void *p, void *data)
{
struct io_ring_ctx *ctx = data;
- const struct cred *cred;
+ struct io_identity *iod;
- cred = idr_remove(&ctx->personality_idr, id);
- if (cred)
- put_cred(cred);
+ iod = idr_remove(&ctx->personality_idr, id);
+ if (iod) {
+ put_cred(iod->creds);
+ if (refcount_dec_and_test(&iod->count))
+ kfree(iod);
+ }
return 0;
}
@@ -8281,7 +8398,8 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
{
struct files_struct *files = data;
- return !files || work->files == files;
+ return !files || ((work->flags & IO_WQ_WORK_FILES) &&
+ work->identity->files == files);
}
/*
@@ -8436,7 +8554,8 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (files && req->work.files != files)
+ if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files != files)
continue;
/* req is being completed, ignore */
if (!refcount_inc_not_zero(&req->refs))
@@ -8564,19 +8683,11 @@ static void io_uring_del_task_file(struct file *file)
fput(file);
}
-static void __io_uring_attempt_task_drop(struct file *file)
-{
- struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
-
- if (old == file)
- io_uring_del_task_file(file);
-}
-
/*
* Drop task note for this file if we're the only ones that hold it after
* pending fput()
*/
-static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+static void io_uring_attempt_task_drop(struct file *file)
{
if (!current->io_uring)
return;
@@ -8584,10 +8695,9 @@ static void io_uring_attempt_task_drop(struct file *file, bool exiting)
* fput() is pending, will be 2 if the only other ref is our potential
* task file note. If the task is exiting, drop regardless of count.
*/
- if (!exiting && atomic_long_read(&file->f_count) != 2)
- return;
-
- __io_uring_attempt_task_drop(file);
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
+ atomic_long_read(&file->f_count) == 2)
+ io_uring_del_task_file(file);
}
void __io_uring_files_cancel(struct files_struct *files)
@@ -8608,12 +8718,6 @@ void __io_uring_files_cancel(struct files_struct *files)
}
}
-static inline bool io_uring_task_idle(struct io_uring_task *tctx)
-{
- return atomic_long_read(&tctx->req_issue) ==
- atomic_long_read(&tctx->req_complete);
-}
-
/*
* Find any io_uring fd that this task has registered or done IO on, and cancel
* requests.
@@ -8622,14 +8726,16 @@ void __io_uring_task_cancel(void)
{
struct io_uring_task *tctx = current->io_uring;
DEFINE_WAIT(wait);
- long completions;
+ s64 inflight;
/* make sure overflow events are dropped */
tctx->in_idle = true;
- while (!io_uring_task_idle(tctx)) {
+ do {
/* read completions before cancelations */
- completions = atomic_long_read(&tctx->req_complete);
+ inflight = percpu_counter_sum(&tctx->inflight);
+ if (!inflight)
+ break;
__io_uring_files_cancel(NULL);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
@@ -8638,12 +8744,10 @@ void __io_uring_task_cancel(void)
* If we've seen completions, retry. This avoids a race where
* a completion comes in before we did prepare_to_wait().
*/
- if (completions != atomic_long_read(&tctx->req_complete))
+ if (inflight != percpu_counter_sum(&tctx->inflight))
continue;
- if (io_uring_task_idle(tctx))
- break;
schedule();
- }
+ } while (1);
finish_wait(&tctx->wait, &wait);
tctx->in_idle = false;
@@ -8651,16 +8755,7 @@ void __io_uring_task_cancel(void)
static int io_uring_flush(struct file *file, void *data)
{
- struct io_ring_ctx *ctx = file->private_data;
-
- /*
- * If the task is going away, cancel work it may have pending
- */
- if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- data = NULL;
-
- io_uring_cancel_task_requests(ctx, data);
- io_uring_attempt_task_drop(file, !data);
+ io_uring_attempt_task_drop(file);
return 0;
}
@@ -9109,7 +9204,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->compat = in_compat_syscall();
ctx->user = user;
ctx->creds = get_current_cred();
-
+#ifdef CONFIG_AUDIT
+ ctx->loginuid = current->loginuid;
+ ctx->sessionid = current->sessionid;
+#endif
ctx->sqo_task = get_task_struct(current);
/*
@@ -9277,23 +9375,33 @@ out:
static int io_register_personality(struct io_ring_ctx *ctx)
{
- const struct cred *creds = get_current_cred();
- int id;
+ struct io_identity *id;
+ int ret;
+
+ id = kmalloc(sizeof(*id), GFP_KERNEL);
+ if (unlikely(!id))
+ return -ENOMEM;
- id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
- USHRT_MAX, GFP_KERNEL);
- if (id < 0)
- put_cred(creds);
- return id;
+ io_init_identity(id);
+ id->creds = get_current_cred();
+
+ ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
+ if (ret < 0) {
+ put_cred(id->creds);
+ kfree(id);
+ }
+ return ret;
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- const struct cred *old_creds;
+ struct io_identity *iod;
- old_creds = idr_remove(&ctx->personality_idr, id);
- if (old_creds) {
- put_cred(old_creds);
+ iod = idr_remove(&ctx->personality_idr, id);
+ if (iod) {
+ put_cred(iod->creds);
+ if (refcount_dec_and_test(&iod->count))
+ kfree(iod);
return 0;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 78f5c96c76f3..ec90773527ee 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1038,8 +1038,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = 0;
buf->f_files = ISOFS_SB(sb)->s_ninodes;
buf->f_ffree = 0;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_namelen = NAME_MAX;
return 0;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6d2da8ad0e6f..fa688e163a80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -187,20 +187,48 @@ static int journal_wait_on_commit_record(journal_t *journal,
* use writepages() because with delayed allocation we may be doing
* block allocation in writepages().
*/
-static int journal_submit_inode_data_buffers(struct address_space *mapping,
- loff_t dirty_start, loff_t dirty_end)
+int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
- int ret;
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
- .range_start = dirty_start,
- .range_end = dirty_end,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
};
- ret = generic_writepages(mapping, &wbc);
- return ret;
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ return generic_writepages(mapping, &wbc);
+}
+
+/* Send all the data buffers related to an inode */
+int jbd2_submit_inode_data(struct jbd2_inode *jinode)
+{
+
+ if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
+ return 0;
+
+ trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+ return jbd2_journal_submit_inode_data_buffers(jinode);
+
+}
+EXPORT_SYMBOL(jbd2_submit_inode_data);
+
+int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+ if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
+ !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
+ return 0;
+ return filemap_fdatawait_range_keep_errors(
+ jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
+ jinode->i_dirty_end);
}
+EXPORT_SYMBOL(jbd2_wait_inode_data);
/*
* Submit all the data buffers of inode associated with the transaction to
@@ -215,29 +243,20 @@ static int journal_submit_data_buffers(journal_t *journal,
{
struct jbd2_inode *jinode;
int err, ret = 0;
- struct address_space *mapping;
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- loff_t dirty_start = jinode->i_dirty_start;
- loff_t dirty_end = jinode->i_dirty_end;
-
if (!(jinode->i_flags & JI_WRITE_DATA))
continue;
- mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- /*
- * submit the inode data buffers. We use writepage
- * instead of writepages. Because writepages can do
- * block allocation with delalloc. We need to write
- * only allocated blocks here.
- */
+ /* submit the inode data buffers. */
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- err = journal_submit_inode_data_buffers(mapping, dirty_start,
- dirty_end);
- if (!ret)
- ret = err;
+ if (journal->j_submit_inode_data_buffers) {
+ err = journal->j_submit_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
@@ -248,6 +267,15 @@ static int journal_submit_data_buffers(journal_t *journal,
return ret;
}
+int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+
+ return filemap_fdatawait_range_keep_errors(mapping,
+ jinode->i_dirty_start,
+ jinode->i_dirty_end);
+}
+
/*
* Wait for data submitted for writeout, refile inodes to proper
* transaction if needed.
@@ -262,18 +290,16 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- loff_t dirty_start = jinode->i_dirty_start;
- loff_t dirty_end = jinode->i_dirty_end;
-
if (!(jinode->i_flags & JI_WAIT_DATA))
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait_range_keep_errors(
- jinode->i_vfs_inode->i_mapping, dirty_start,
- dirty_end);
- if (!ret)
- ret = err;
+ /* wait for the inode data buffers writeout. */
+ if (journal->j_finish_inode_data_buffers) {
+ err = journal->j_finish_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
@@ -413,6 +439,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_running_transaction != NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ write_lock(&journal->j_state_lock);
+ finish_wait(&journal->j_fc_wait, &wait);
+ }
+ write_unlock(&journal->j_state_lock);
+
commit_transaction = journal->j_running_transaction;
trace_jbd2_start_commit(journal, commit_transaction);
@@ -420,6 +460,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ journal->j_fc_off = 0;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
@@ -1119,12 +1160,16 @@ restart_loop:
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 1);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Check if the transaction can be dropped now that we are finished */
@@ -1136,6 +1181,7 @@ restart_loop:
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
+ wake_up(&journal->j_fc_wait);
/*
* Calculate overall stats
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 17fdc482f554..0c7c42bd530f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -91,6 +91,8 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
+EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
+EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -157,7 +159,9 @@ static void commit_timeout(struct timer_list *t)
*
* 1) COMMIT: Every so often we need to commit the current state of the
* filesystem to disk. The journal thread is responsible for writing
- * all of the metadata buffers to disk.
+ * all of the metadata buffers to disk. If a fast commit is ongoing
+ * journal thread waits until it's done and then continues from
+ * there on.
*
* 2) CHECKPOINT: We cannot reuse a used section of the log file until all
* of the data in that part of the log has been rewritten elsewhere on
@@ -714,6 +718,75 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}
+/*
+ * Start a fast commit. If there's an ongoing fast or full commit wait for
+ * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
+ * if a fast commit is not needed, either because there's an already a commit
+ * going on or this tid has already been committed. Returns -EINVAL if no jbd2
+ * commit has yet been performed.
+ */
+int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
+{
+ /*
+ * Fast commits only allowed if at least one full commit has
+ * been processed.
+ */
+ if (!journal->j_stats.ts_tid)
+ return -EINVAL;
+
+ if (tid <= journal->j_commit_sequence)
+ return -EALREADY;
+
+ write_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_fc_wait, &wait);
+ return -EALREADY;
+ }
+ journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_begin_commit);
+
+/*
+ * Stop a fast commit. If fallback is set, this function starts commit of
+ * TID tid before any other fast commit can start.
+ */
+static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
+{
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 0);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
+ if (fallback)
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_fc_wait);
+ if (fallback)
+ return jbd2_complete_transaction(journal, tid);
+ return 0;
+}
+
+int jbd2_fc_end_commit(journal_t *journal)
+{
+ return __jbd2_fc_end_commit(journal, 0, 0);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit);
+
+int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid)
+{
+ return __jbd2_fc_end_commit(journal, tid, 1);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
+
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
@@ -782,6 +855,110 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
return jbd2_journal_bmap(journal, blocknr, retp);
}
+/* Map one fast commit buffer for use by the file system */
+int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
+{
+ unsigned long long pblock;
+ unsigned long blocknr;
+ int ret = 0;
+ struct buffer_head *bh;
+ int fc_off;
+
+ *bh_out = NULL;
+ write_lock(&journal->j_state_lock);
+
+ if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
+ fc_off = journal->j_fc_off;
+ blocknr = journal->j_fc_first + fc_off;
+ journal->j_fc_off++;
+ } else {
+ ret = -EINVAL;
+ }
+ write_unlock(&journal->j_state_lock);
+
+ if (ret)
+ return ret;
+
+ ret = jbd2_journal_bmap(journal, blocknr, &pblock);
+ if (ret)
+ return ret;
+
+ bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
+ if (!bh)
+ return -ENOMEM;
+
+ lock_buffer(bh);
+
+ clear_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+ journal->j_fc_wbuf[fc_off] = bh;
+
+ *bh_out = bh;
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_get_buf);
+
+/*
+ * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
+ * for completion.
+ */
+int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ read_lock(&journal->j_state_lock);
+ j_fc_off = journal->j_fc_off;
+ read_unlock(&journal->j_state_lock);
+
+ /*
+ * Wait in reverse order to minimize chances of us being woken up before
+ * all IOs have completed
+ */
+ for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
+ bh = journal->j_fc_wbuf[i];
+ wait_on_buffer(bh);
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ if (unlikely(!buffer_uptodate(bh)))
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_wait_bufs);
+
+/*
+ * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
+ * for completion.
+ */
+int jbd2_fc_release_bufs(journal_t *journal)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ read_lock(&journal->j_state_lock);
+ j_fc_off = journal->j_fc_off;
+ read_unlock(&journal->j_state_lock);
+
+ /*
+ * Wait in reverse order to minimize chances of us being woken up before
+ * all IOs have completed
+ */
+ for (i = j_fc_off - 1; i >= 0; i--) {
+ bh = journal->j_fc_wbuf[i];
+ if (!bh)
+ break;
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_release_bufs);
+
/*
* Conversion of logical to physical block numbers for the journal
*
@@ -1140,6 +1317,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
+ init_waitqueue_head(&journal->j_fc_wait);
mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
@@ -1179,6 +1357,14 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (!journal->j_wbuf)
goto err_cleanup;
+ if (journal->j_fc_wbufsize > 0) {
+ journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!journal->j_fc_wbuf)
+ goto err_cleanup;
+ }
+
bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
if (!bh) {
pr_err("%s: Cannot get buffer for journal superblock\n",
@@ -1192,11 +1378,23 @@ static journal_t *journal_init_common(struct block_device *bdev,
err_cleanup:
kfree(journal->j_wbuf);
+ kfree(journal->j_fc_wbuf);
jbd2_journal_destroy_revoke(journal);
kfree(journal);
return NULL;
}
+int jbd2_fc_init(journal_t *journal, int num_fc_blks)
+{
+ journal->j_fc_wbufsize = num_fc_blks;
+ journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
+ sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!journal->j_fc_wbuf)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_init);
+
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
*
* Create a journal structure assigned some fixed set of disk blocks to
@@ -1314,11 +1512,20 @@ static int journal_reset(journal_t *journal)
}
journal->j_first = first;
- journal->j_last = last;
- journal->j_head = first;
- journal->j_tail = first;
- journal->j_free = last - first;
+ if (jbd2_has_feature_fast_commit(journal) &&
+ journal->j_fc_wbufsize > 0) {
+ journal->j_fc_last = last;
+ journal->j_last = last - journal->j_fc_wbufsize;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ } else {
+ journal->j_last = last;
+ }
+
+ journal->j_head = journal->j_first;
+ journal->j_tail = journal->j_first;
+ journal->j_free = journal->j_last - journal->j_first;
journal->j_tail_sequence = journal->j_transaction_sequence;
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
@@ -1464,6 +1671,7 @@ out:
static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
+ bool had_fast_commit = false;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
lock_buffer(journal->j_sb_buffer);
@@ -1477,9 +1685,20 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
+ if (jbd2_has_feature_fast_commit(journal)) {
+ /*
+ * When journal is clean, no need to commit fast commit flag and
+ * make file system incompatible with older kernels.
+ */
+ jbd2_clear_feature_fast_commit(journal);
+ had_fast_commit = true;
+ }
jbd2_write_superblock(journal, write_op);
+ if (had_fast_commit)
+ jbd2_set_feature_fast_commit(journal);
+
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_FLUSHED;
@@ -1663,9 +1882,18 @@ static int load_superblock(journal_t *journal)
journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
journal->j_tail = be32_to_cpu(sb->s_start);
journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
journal->j_errno = be32_to_cpu(sb->s_errno);
+ if (jbd2_has_feature_fast_commit(journal) &&
+ journal->j_fc_wbufsize > 0) {
+ journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
+ journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ } else {
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+ }
+
return 0;
}
@@ -1726,6 +1954,9 @@ int jbd2_journal_load(journal_t *journal)
*/
journal->j_flags &= ~JBD2_ABORT;
+ if (journal->j_fc_wbufsize > 0)
+ jbd2_journal_set_features(journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
@@ -1809,6 +2040,8 @@ int jbd2_journal_destroy(journal_t *journal)
jbd2_journal_destroy_revoke(journal);
if (journal->j_chksum_driver)
crypto_free_shash(journal->j_chksum_driver);
+ if (journal->j_fc_wbufsize > 0)
+ kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
kfree(journal);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index faa97d748474..eb2606133cd8 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -35,7 +35,6 @@ struct recovery_info
int nr_revoke_hits;
};
-enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
@@ -225,10 +224,51 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
- if (var >= (journal)->j_last) \
- var -= ((journal)->j_last - (journal)->j_first); \
+ unsigned long _wrap_last = \
+ jbd2_has_feature_fast_commit(journal) ? \
+ (journal)->j_fc_last : (journal)->j_last; \
+ \
+ if (var >= _wrap_last) \
+ var -= (_wrap_last - (journal)->j_first); \
} while (0)
+static int fc_do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+{
+ unsigned int expected_commit_id = info->end_transaction;
+ unsigned long next_fc_block;
+ struct buffer_head *bh;
+ int err = 0;
+
+ next_fc_block = journal->j_fc_first;
+ if (!journal->j_fc_replay_callback)
+ return 0;
+
+ while (next_fc_block <= journal->j_fc_last) {
+ jbd_debug(3, "Fast commit replay: next block %ld",
+ next_fc_block);
+ err = jread(&bh, journal, next_fc_block);
+ if (err) {
+ jbd_debug(3, "Fast commit replay: read error");
+ break;
+ }
+
+ jbd_debug(3, "Processing fast commit blk with seq %d");
+ err = journal->j_fc_replay_callback(journal, bh, pass,
+ next_fc_block - journal->j_fc_first,
+ expected_commit_id);
+ next_fc_block++;
+ if (err < 0 || err == JBD2_FC_REPLAY_STOP)
+ break;
+ err = 0;
+ }
+
+ if (err)
+ jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+
+ return err;
+}
+
/**
* jbd2_journal_recover - recovers a on-disk journal
* @journal: the journal to recover
@@ -428,6 +468,8 @@ static int do_one_pass(journal_t *journal,
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
int block_error = 0;
+ bool need_check_commit_time = false;
+ __u64 last_trans_commit_time = 0, commit_time;
/*
* First thing is to establish what we expect to find in the log
@@ -470,7 +512,9 @@ static int do_one_pass(journal_t *journal,
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
- next_commit_ID, next_log_block, journal->j_last);
+ next_commit_ID, next_log_block,
+ jbd2_has_feature_fast_commit(journal) ?
+ journal->j_fc_last : journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
@@ -520,12 +564,21 @@ static int do_one_pass(journal_t *journal,
if (descr_csum_size > 0 &&
!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
- printk(KERN_ERR "JBD2: Invalid checksum "
- "recovering block %lu in log\n",
- next_log_block);
- err = -EFSBADCRC;
- brelse(bh);
- goto failed;
+ /*
+ * PASS_SCAN can see stale blocks due to lazy
+ * journal init. Don't error out on those yet.
+ */
+ if (pass != PASS_SCAN) {
+ pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
+ next_log_block);
+ err = -EFSBADCRC;
+ brelse(bh);
+ goto failed;
+ }
+ need_check_commit_time = true;
+ jbd_debug(1,
+ "invalid descriptor block found in %lu\n",
+ next_log_block);
}
/* If it is a valid descriptor block, replay it
@@ -535,6 +588,7 @@ static int do_one_pass(journal_t *journal,
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal) &&
+ !need_check_commit_time &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -683,11 +737,41 @@ static int do_one_pass(journal_t *journal,
* mentioned conditions. Hence assume
* "Interrupted Commit".)
*/
+ commit_time = be64_to_cpu(
+ ((struct commit_header *)bh->b_data)->h_commit_sec);
+ /*
+ * If need_check_commit_time is set, it means we are in
+ * PASS_SCAN and csum verify failed before. If
+ * commit_time is increasing, it's the same journal,
+ * otherwise it is stale journal block, just end this
+ * recovery.
+ */
+ if (need_check_commit_time) {
+ if (commit_time >= last_trans_commit_time) {
+ pr_err("JBD2: Invalid checksum found in transaction %u\n",
+ next_commit_ID);
+ err = -EFSBADCRC;
+ brelse(bh);
+ goto failed;
+ }
+ ignore_crc_mismatch:
+ /*
+ * It likely does not belong to same journal,
+ * just end this recovery with success.
+ */
+ jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+ next_commit_ID);
+ err = 0;
+ brelse(bh);
+ goto done;
+ }
- /* Found an expected commit block: if checksums
- * are present verify them in PASS_SCAN; else not
+ /*
+ * Found an expected commit block: if checksums
+ * are present, verify them in PASS_SCAN; else not
* much to do other than move on to the next sequence
- * number. */
+ * number.
+ */
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal)) {
struct commit_header *cbh =
@@ -719,6 +803,8 @@ static int do_one_pass(journal_t *journal,
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
info->end_transaction = next_commit_ID;
if (!jbd2_has_feature_async_commit(journal)) {
@@ -728,11 +814,24 @@ static int do_one_pass(journal_t *journal,
break;
}
}
+ if (pass == PASS_SCAN)
+ last_trans_commit_time = commit_time;
brelse(bh);
next_commit_ID++;
continue;
case JBD2_REVOKE_BLOCK:
+ /*
+ * Check revoke block crc in pass_scan, if csum verify
+ * failed, check commit block time later.
+ */
+ if (pass == PASS_SCAN &&
+ !jbd2_descriptor_block_csum_verify(journal,
+ bh->b_data)) {
+ jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+ next_log_block);
+ need_check_commit_time = true;
+ }
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
@@ -777,6 +876,13 @@ static int do_one_pass(journal_t *journal,
success = -EIO;
}
}
+
+ if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) {
+ err = fc_do_one_pass(journal, info, pass);
+ if (err)
+ success = err;
+ }
+
if (block_error && success == 0)
success = -EIO;
return success;
@@ -800,9 +906,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
rcount = be32_to_cpu(header->r_count);
- if (!jbd2_descriptor_block_csum_verify(journal, header))
- return -EFSBADCRC;
-
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
if (rcount > journal->j_blocksize - csum_size)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1eabd91870e6..1d9488cf0534 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -417,7 +417,7 @@ void nsm_release(struct nsm_handle *nsm)
/*
* XDR functions for NSM.
*
- * See http://www.opengroup.org/ for details on the Network
+ * See https://www.opengroup.org/ for details on the Network
* Status Monitor wire protocol.
*/
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index e4d3f783e06a..fa41dda39925 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -486,65 +486,215 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp)
return rpc_success;
}
+static __be32
+nlm4svc_proc_unused(struct svc_rqst *rqstp)
+{
+ return rpc_proc_unavail;
+}
+
/*
* NLM Server procedures.
*/
-#define nlm4svc_encode_norep nlm4svc_encode_void
-#define nlm4svc_decode_norep nlm4svc_decode_void
-#define nlm4svc_decode_testres nlm4svc_decode_void
-#define nlm4svc_decode_lockres nlm4svc_decode_void
-#define nlm4svc_decode_unlockres nlm4svc_decode_void
-#define nlm4svc_decode_cancelres nlm4svc_decode_void
-#define nlm4svc_decode_grantedres nlm4svc_decode_void
-
-#define nlm4svc_proc_none nlm4svc_proc_null
-#define nlm4svc_proc_test_res nlm4svc_proc_null
-#define nlm4svc_proc_lock_res nlm4svc_proc_null
-#define nlm4svc_proc_cancel_res nlm4svc_proc_null
-#define nlm4svc_proc_unlock_res nlm4svc_proc_null
-
struct nlm_void { int dummy; };
-#define PROC(name, xargt, xrest, argt, rest, respsize) \
- { .pc_func = nlm4svc_proc_##name, \
- .pc_decode = nlm4svc_decode_##xargt, \
- .pc_encode = nlm4svc_encode_##xrest, \
- .pc_release = NULL, \
- .pc_argsize = sizeof(struct nlm_##argt), \
- .pc_ressize = sizeof(struct nlm_##rest), \
- .pc_xdrressize = respsize, \
- }
#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */
#define No (1+1024/4) /* netobj */
#define St 1 /* status */
#define Rg 4 /* range (offset + length) */
-const struct svc_procedure nlmsvc_procedures4[] = {
- PROC(null, void, void, void, void, 1),
- PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg),
- PROC(lock, lockargs, res, args, res, Ck+St),
- PROC(cancel, cancargs, res, args, res, Ck+St),
- PROC(unlock, unlockargs, res, args, res, Ck+St),
- PROC(granted, testargs, res, args, res, Ck+St),
- PROC(test_msg, testargs, norep, args, void, 1),
- PROC(lock_msg, lockargs, norep, args, void, 1),
- PROC(cancel_msg, cancargs, norep, args, void, 1),
- PROC(unlock_msg, unlockargs, norep, args, void, 1),
- PROC(granted_msg, testargs, norep, args, void, 1),
- PROC(test_res, testres, norep, res, void, 1),
- PROC(lock_res, lockres, norep, res, void, 1),
- PROC(cancel_res, cancelres, norep, res, void, 1),
- PROC(unlock_res, unlockres, norep, res, void, 1),
- PROC(granted_res, res, norep, res, void, 1),
- /* statd callback */
- PROC(sm_notify, reboot, void, reboot, void, 1),
- PROC(none, void, void, void, void, 0),
- PROC(none, void, void, void, void, 0),
- PROC(none, void, void, void, void, 0),
- PROC(share, shareargs, shareres, args, res, Ck+St+1),
- PROC(unshare, shareargs, shareres, args, res, Ck+St+1),
- PROC(nm_lock, lockargs, res, args, res, Ck+St),
- PROC(free_all, notify, void, args, void, 1),
+const struct svc_procedure nlmsvc_procedures4[24] = {
+ [NLMPROC_NULL] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_TEST] = {
+ .pc_func = nlm4svc_proc_test,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_testres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+2+No+Rg,
+ },
+ [NLMPROC_LOCK] = {
+ .pc_func = nlm4svc_proc_lock,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_CANCEL] = {
+ .pc_func = nlm4svc_proc_cancel,
+ .pc_decode = nlm4svc_decode_cancargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_UNLOCK] = {
+ .pc_func = nlm4svc_proc_unlock,
+ .pc_decode = nlm4svc_decode_unlockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_GRANTED] = {
+ .pc_func = nlm4svc_proc_granted,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_TEST_MSG] = {
+ .pc_func = nlm4svc_proc_test_msg,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_LOCK_MSG] = {
+ .pc_func = nlm4svc_proc_lock_msg,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_CANCEL_MSG] = {
+ .pc_func = nlm4svc_proc_cancel_msg,
+ .pc_decode = nlm4svc_decode_cancargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_UNLOCK_MSG] = {
+ .pc_func = nlm4svc_proc_unlock_msg,
+ .pc_decode = nlm4svc_decode_unlockargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_GRANTED_MSG] = {
+ .pc_func = nlm4svc_proc_granted_msg,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_TEST_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_LOCK_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_CANCEL_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_UNLOCK_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_GRANTED_RES] = {
+ .pc_func = nlm4svc_proc_granted_res,
+ .pc_decode = nlm4svc_decode_res,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_NSM_NOTIFY] = {
+ .pc_func = nlm4svc_proc_sm_notify,
+ .pc_decode = nlm4svc_decode_reboot,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_reboot),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [17] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
+ },
+ [18] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
+ },
+ [19] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
+ },
+ [NLMPROC_SHARE] = {
+ .pc_func = nlm4svc_proc_share,
+ .pc_decode = nlm4svc_decode_shareargs,
+ .pc_encode = nlm4svc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
+ },
+ [NLMPROC_UNSHARE] = {
+ .pc_func = nlm4svc_proc_unshare,
+ .pc_decode = nlm4svc_decode_shareargs,
+ .pc_encode = nlm4svc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
+ },
+ [NLMPROC_NM_LOCK] = {
+ .pc_func = nlm4svc_proc_nm_lock,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_FREE_ALL] = {
+ .pc_func = nlm4svc_proc_free_all,
+ .pc_decode = nlm4svc_decode_notify,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
};
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index d0bb7a6bf005..50855f2c1f4b 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -529,66 +529,214 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp)
return rpc_success;
}
+static __be32
+nlmsvc_proc_unused(struct svc_rqst *rqstp)
+{
+ return rpc_proc_unavail;
+}
+
/*
* NLM Server procedures.
*/
-#define nlmsvc_encode_norep nlmsvc_encode_void
-#define nlmsvc_decode_norep nlmsvc_decode_void
-#define nlmsvc_decode_testres nlmsvc_decode_void
-#define nlmsvc_decode_lockres nlmsvc_decode_void
-#define nlmsvc_decode_unlockres nlmsvc_decode_void
-#define nlmsvc_decode_cancelres nlmsvc_decode_void
-#define nlmsvc_decode_grantedres nlmsvc_decode_void
-
-#define nlmsvc_proc_none nlmsvc_proc_null
-#define nlmsvc_proc_test_res nlmsvc_proc_null
-#define nlmsvc_proc_lock_res nlmsvc_proc_null
-#define nlmsvc_proc_cancel_res nlmsvc_proc_null
-#define nlmsvc_proc_unlock_res nlmsvc_proc_null
-
struct nlm_void { int dummy; };
-#define PROC(name, xargt, xrest, argt, rest, respsize) \
- { .pc_func = nlmsvc_proc_##name, \
- .pc_decode = nlmsvc_decode_##xargt, \
- .pc_encode = nlmsvc_encode_##xrest, \
- .pc_release = NULL, \
- .pc_argsize = sizeof(struct nlm_##argt), \
- .pc_ressize = sizeof(struct nlm_##rest), \
- .pc_xdrressize = respsize, \
- }
-
#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */
#define St 1 /* status */
#define No (1+1024/4) /* Net Obj */
#define Rg 2 /* range - offset + size */
-const struct svc_procedure nlmsvc_procedures[] = {
- PROC(null, void, void, void, void, 1),
- PROC(test, testargs, testres, args, res, Ck+St+2+No+Rg),
- PROC(lock, lockargs, res, args, res, Ck+St),
- PROC(cancel, cancargs, res, args, res, Ck+St),
- PROC(unlock, unlockargs, res, args, res, Ck+St),
- PROC(granted, testargs, res, args, res, Ck+St),
- PROC(test_msg, testargs, norep, args, void, 1),
- PROC(lock_msg, lockargs, norep, args, void, 1),
- PROC(cancel_msg, cancargs, norep, args, void, 1),
- PROC(unlock_msg, unlockargs, norep, args, void, 1),
- PROC(granted_msg, testargs, norep, args, void, 1),
- PROC(test_res, testres, norep, res, void, 1),
- PROC(lock_res, lockres, norep, res, void, 1),
- PROC(cancel_res, cancelres, norep, res, void, 1),
- PROC(unlock_res, unlockres, norep, res, void, 1),
- PROC(granted_res, res, norep, res, void, 1),
- /* statd callback */
- PROC(sm_notify, reboot, void, reboot, void, 1),
- PROC(none, void, void, void, void, 1),
- PROC(none, void, void, void, void, 1),
- PROC(none, void, void, void, void, 1),
- PROC(share, shareargs, shareres, args, res, Ck+St+1),
- PROC(unshare, shareargs, shareres, args, res, Ck+St+1),
- PROC(nm_lock, lockargs, res, args, res, Ck+St),
- PROC(free_all, notify, void, args, void, 0),
-
+const struct svc_procedure nlmsvc_procedures[24] = {
+ [NLMPROC_NULL] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_TEST] = {
+ .pc_func = nlmsvc_proc_test,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_testres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+2+No+Rg,
+ },
+ [NLMPROC_LOCK] = {
+ .pc_func = nlmsvc_proc_lock,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_CANCEL] = {
+ .pc_func = nlmsvc_proc_cancel,
+ .pc_decode = nlmsvc_decode_cancargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_UNLOCK] = {
+ .pc_func = nlmsvc_proc_unlock,
+ .pc_decode = nlmsvc_decode_unlockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_GRANTED] = {
+ .pc_func = nlmsvc_proc_granted,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_TEST_MSG] = {
+ .pc_func = nlmsvc_proc_test_msg,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_LOCK_MSG] = {
+ .pc_func = nlmsvc_proc_lock_msg,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_CANCEL_MSG] = {
+ .pc_func = nlmsvc_proc_cancel_msg,
+ .pc_decode = nlmsvc_decode_cancargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_UNLOCK_MSG] = {
+ .pc_func = nlmsvc_proc_unlock_msg,
+ .pc_decode = nlmsvc_decode_unlockargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_GRANTED_MSG] = {
+ .pc_func = nlmsvc_proc_granted_msg,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_TEST_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_LOCK_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_CANCEL_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_UNLOCK_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_GRANTED_RES] = {
+ .pc_func = nlmsvc_proc_granted_res,
+ .pc_decode = nlmsvc_decode_res,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_NSM_NOTIFY] = {
+ .pc_func = nlmsvc_proc_sm_notify,
+ .pc_decode = nlmsvc_decode_reboot,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_reboot),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [17] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [18] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [19] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
+ },
+ [NLMPROC_SHARE] = {
+ .pc_func = nlmsvc_proc_share,
+ .pc_decode = nlmsvc_decode_shareargs,
+ .pc_encode = nlmsvc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
+ },
+ [NLMPROC_UNSHARE] = {
+ .pc_func = nlmsvc_proc_unshare,
+ .pc_decode = nlmsvc_decode_shareargs,
+ .pc_encode = nlmsvc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
+ },
+ [NLMPROC_NM_LOCK] = {
+ .pc_func = nlmsvc_proc_nm_lock,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
+ },
+ [NLMPROC_FREE_ALL] = {
+ .pc_func = nlmsvc_proc_free_all,
+ .pc_decode = nlmsvc_decode_notify,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
+ },
};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7b09a9158e40..34f546404aa1 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -383,8 +383,7 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sbi->s_ninodes;
buf->f_ffree = minix_count_free_inodes(sb);
buf->f_namelen = sbi->s_namelen;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/namei.c b/fs/namei.c
index f1eb8ccd2be9..d4a6dd772303 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1626,7 +1626,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
return ERR_PTR(error);
}
- if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+ if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
+ unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
if (!(nd->flags & LOOKUP_RCU)) {
diff --git a/fs/namespace.c b/fs/namespace.c
index 294e05a13d17..cebaa3e81794 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1191,7 +1191,7 @@ static void mntput_no_expire(struct mount *mnt)
struct task_struct *task = current;
if (likely(!(task->flags & PF_KTHREAD))) {
init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
- if (!task_work_add(task, &mnt->mnt_rcu, true))
+ if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
return;
}
if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
@@ -3171,6 +3171,8 @@ int path_mount(const char *dev_name, struct path *path,
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
+ if (flags & MS_NOSYMFOLLOW)
+ mnt_flags |= MNT_NOSYMFOLLOW;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 222afba70bc0..29ec8b09a52d 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -94,6 +94,7 @@ enum {
static const struct constant_table nfs_param_enums_local_lock[] = {
{ "all", Opt_local_lock_all },
{ "flock", Opt_local_lock_flock },
+ { "posix", Opt_local_lock_posix },
{ "none", Opt_local_lock_none },
{}
};
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 6b063227e34e..2bcbe38afe2e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -32,9 +32,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
/*
* nfs_path - reconstruct the path given an arbitrary dentry
* @base - used to return pointer to the end of devname part of path
- * @dentry - pointer to dentry
+ * @dentry_in - pointer to dentry
* @buffer - result buffer
- * @buflen - length of buffer
+ * @buflen_in - length of buffer
* @flags - options (see below)
*
* Helper function for constructing the server pathname
@@ -49,15 +49,19 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
* the original device (export) name
* (if unset, the original name is returned verbatim)
*/
-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
- unsigned flags)
+char *nfs_path(char **p, struct dentry *dentry_in, char *buffer,
+ ssize_t buflen_in, unsigned flags)
{
char *end;
int namelen;
unsigned seq;
const char *base;
+ struct dentry *dentry;
+ ssize_t buflen;
rename_retry:
+ buflen = buflen_in;
+ dentry = dentry_in;
end = buffer+buflen;
*--end = '\0';
buflen--;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 86777996cfec..b51424ff8159 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -67,7 +67,6 @@ struct nfs4_xattr_bucket {
struct nfs4_xattr_cache {
struct kref ref;
- spinlock_t hash_lock; /* protects hashtable and lru */
struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE];
struct list_head lru;
struct list_head dispose;
@@ -882,7 +881,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
unsigned long count;
- count = list_lru_count(&nfs4_xattr_cache_lru);
+ count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc);
return vfs_pressure_ratio(count);
}
@@ -976,7 +975,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- count = list_lru_count(lru);
+ count = list_lru_shrink_count(lru, sc);
return vfs_pressure_ratio(count);
}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index cc50085e151c..0dc31ad2362e 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -45,6 +45,15 @@
#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
+#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 2 /* data_info4.di_length */)
+#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
+ 1 /* rpr_eof */ + \
+ 1 /* rpr_contents count */ + \
+ 2 * NFS42_READ_PLUS_SEGMENT_SIZE)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -128,6 +137,14 @@
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_plus_maxsz)
+#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_plus_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -324,6 +341,16 @@ static void encode_deallocate(struct xdr_stream *xdr,
encode_fallocate(xdr, args);
}
+static void encode_read_plus(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint64(xdr, args->offset);
+ encode_uint32(xdr, args->count);
+}
+
static void encode_seek(struct xdr_stream *xdr,
const struct nfs42_seek_args *args,
struct compound_hdr *hdr)
@@ -723,6 +750,28 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
}
/*
+ * Encode READ_PLUS request
+ */
+static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_read_plus(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
+ encode_nops(&hdr);
+}
+
+/*
* Encode SEEK request
*/
static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
@@ -970,6 +1019,97 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
return decode_op_hdr(xdr, OP_DEALLOCATE);
}
+static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+ uint32_t *eof)
+{
+ uint32_t count, recvd;
+ uint64_t offset;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ p = xdr_decode_hyper(p, &offset);
+ count = be32_to_cpup(p);
+ recvd = xdr_align_data(xdr, res->count, count);
+ res->count += recvd;
+
+ if (count > recvd) {
+ dprintk("NFS: server cheating in read reply: "
+ "count %u > recvd %u\n", count, recvd);
+ *eof = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+ uint32_t *eof)
+{
+ uint64_t offset, length, recvd;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8 + 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ p = xdr_decode_hyper(p, &offset);
+ p = xdr_decode_hyper(p, &length);
+ recvd = xdr_expand_hole(xdr, res->count, length);
+ res->count += recvd;
+
+ if (recvd < length) {
+ *eof = 0;
+ return 1;
+ }
+ return 0;
+}
+
+static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+ uint32_t eof, segments, type;
+ int status, i;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_READ_PLUS);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ eof = be32_to_cpup(p++);
+ segments = be32_to_cpup(p++);
+ if (segments == 0)
+ goto out;
+
+ for (i = 0; i < segments; i++) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ type = be32_to_cpup(p++);
+ if (type == NFS4_CONTENT_DATA)
+ status = decode_read_plus_data(xdr, res, &eof);
+ else if (type == NFS4_CONTENT_HOLE)
+ status = decode_read_plus_hole(xdr, res, &eof);
+ else
+ return -EINVAL;
+
+ if (status < 0)
+ return status;
+ if (status > 0)
+ break;
+ }
+
+out:
+ res->eof = eof;
+ return 0;
+}
+
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
{
int status;
@@ -1147,6 +1287,33 @@ out:
}
/*
+ * Decode READ_PLUS request
+ */
+static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_read_plus(xdr, res);
+ if (!status)
+ status = res->count;
+out:
+ return status;
+}
+
+/*
* Decode SEEK request
*/
static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0c9505dc852c..065cb04222a1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -599,6 +599,14 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat
return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
}
+static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ u32 seq1 = be32_to_cpu(s1->seqid);
+ u32 seq2 = be32_to_cpu(s2->seqid);
+
+ return seq2 == seq1 + 1U || (seq2 == 1U && seq1 == 0xffffffffU);
+}
+
static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src)
{
return nfs4_stateid_match_other(dst, src) &&
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index daacc78a3d48..be7915c861ce 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1045,6 +1045,8 @@ static int nfs4_server_common_setup(struct nfs_server *server,
server->caps |= server->nfs_client->cl_mvops->init_caps;
if (server->flags & NFS_MOUNT_NORDIRPLUS)
server->caps &= ~NFS_CAP_READDIRPLUS;
+ if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
+ server->caps &= ~NFS_CAP_READ_PLUS;
/*
* Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
* authentication.
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index fdfc77486ace..9d354de613da 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -9,6 +9,7 @@
#include <linux/falloc.h>
#include <linux/mount.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
@@ -145,7 +146,8 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
/* Only offload copy if superblock is the same */
if (file_in->f_op != &nfs4_file_operations)
return -EXDEV;
- if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+ if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) ||
+ !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY))
return -EOPNOTSUPP;
if (file_inode(file_in) == file_inode(file_out))
return -EOPNOTSUPP;
@@ -314,9 +316,8 @@ out:
static int read_name_gen = 1;
#define SSC_READ_NAME_BODY "ssc_read_%d"
-struct file *
-nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh,
- nfs4_stateid *stateid)
+static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
+ struct nfs_fh *src_fh, nfs4_stateid *stateid)
{
struct nfs_fattr fattr;
struct file *filep, *res;
@@ -398,14 +399,40 @@ out_filep:
fput(filep);
goto out_free_name;
}
-EXPORT_SYMBOL_GPL(nfs42_ssc_open);
-void nfs42_ssc_close(struct file *filep)
+
+static void __nfs42_ssc_close(struct file *filep)
{
struct nfs_open_context *ctx = nfs_file_open_context(filep);
ctx->state->flags = 0;
}
-EXPORT_SYMBOL_GPL(nfs42_ssc_close);
+
+static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
+ .sco_open = __nfs42_ssc_open,
+ .sco_close = __nfs42_ssc_close,
+};
+
+/**
+ * nfs42_ssc_register_ops - Wrapper to register NFS_V4 ops in nfs_common
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_register_ops(void)
+{
+ nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
+}
+
+/**
+ * nfs42_ssc_unregister_ops - wrapper to un-register NFS_V4 ops in nfs_common
+ *
+ * Return values:
+ * None.
+ */
+void nfs42_ssc_unregister_ops(void)
+{
+ nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
+}
#endif /* CONFIG_NFS_V4_2 */
const struct file_operations nfs4_file_operations = {
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 62e6eea5c516..8d8aba305ecc 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -46,6 +46,7 @@
#include <keys/user-type.h>
#include <keys/request_key_auth-type.h>
#include <linux/module.h>
+#include <linux/user_namespace.h>
#include "internal.h"
#include "netns.h"
@@ -69,13 +70,13 @@ struct idmap {
struct rpc_pipe *idmap_pipe;
struct idmap_legacy_upcalldata *idmap_upcall_data;
struct mutex idmap_mutex;
- const struct cred *cred;
+ struct user_namespace *user_ns;
};
static struct user_namespace *idmap_userns(const struct idmap *idmap)
{
- if (idmap && idmap->cred)
- return idmap->cred->user_ns;
+ if (idmap && idmap->user_ns)
+ return idmap->user_ns;
return &init_user_ns;
}
@@ -286,7 +287,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
if (ret < 0)
return ERR_PTR(ret);
- if (!idmap->cred || idmap->cred->user_ns == &init_user_ns)
+ if (!idmap->user_ns || idmap->user_ns == &init_user_ns)
rkey = request_key(&key_type_id_resolver, desc, "");
if (IS_ERR(rkey)) {
mutex_lock(&idmap->idmap_mutex);
@@ -462,7 +463,7 @@ nfs_idmap_new(struct nfs_client *clp)
return -ENOMEM;
mutex_init(&idmap->idmap_mutex);
- idmap->cred = get_cred(clp->cl_rpcclient->cl_cred);
+ idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns);
rpc_init_pipe_dir_object(&idmap->idmap_pdo,
&nfs_idmap_pipe_dir_object_ops,
@@ -486,7 +487,7 @@ nfs_idmap_new(struct nfs_client *clp)
err_destroy_pipe:
rpc_destroy_pipe_data(idmap->idmap_pipe);
err:
- put_cred(idmap->cred);
+ get_user_ns(idmap->user_ns);
kfree(idmap);
return error;
}
@@ -503,7 +504,7 @@ nfs_idmap_delete(struct nfs_client *clp)
&clp->cl_rpcclient->cl_pipedir_objects,
&idmap->idmap_pdo);
rpc_destroy_pipe_data(idmap->idmap_pipe);
- put_cred(idmap->cred);
+ put_user_ns(idmap->user_ns);
kfree(idmap);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6e95c85fe395..9e0ca9b2b210 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -63,6 +63,7 @@
#include "callback.h"
#include "pnfs.h"
#include "netns.h"
+#include "sysfs.h"
#include "nfs4idmap.h"
#include "nfs4session.h"
#include "fscache.h"
@@ -70,6 +71,10 @@
#include "nfs4trace.h"
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif /* CONFIG_NFS_V4_2 */
+
#define NFSDBG_FACILITY NFSDBG_PROC
#define NFS4_BITMASK_SZ 3
@@ -107,6 +112,9 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
const struct cred *, bool);
#endif
+static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
+ struct nfs_server *server,
+ struct nfs4_label *label);
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static inline struct nfs4_label *
@@ -1547,19 +1555,6 @@ static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
wake_up_all(&state->waitq);
}
-static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state,
- const nfs4_stateid *stateid)
-{
- u32 state_seqid = be32_to_cpu(state->open_stateid.seqid);
- u32 stateid_seqid = be32_to_cpu(stateid->seqid);
-
- if (stateid_seqid == state_seqid + 1U ||
- (stateid_seqid == 1U && state_seqid == 0xffffffffU))
- nfs_state_log_update_open_stateid(state);
- else
- set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
-}
-
static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
{
struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1585,21 +1580,19 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
* i.e. The stateid seqids have to be initialised to 1, and
* are then incremented on every state transition.
*/
-static bool nfs_need_update_open_stateid(struct nfs4_state *state,
+static bool nfs_stateid_is_sequential(struct nfs4_state *state,
const nfs4_stateid *stateid)
{
- if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 ||
- !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (test_bit(NFS_OPEN_STATE, &state->flags)) {
+ /* The common case - we're updating to a new sequence number */
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ nfs4_stateid_is_next(&state->open_stateid, stateid)) {
+ return true;
+ }
+ } else {
+ /* This is the first OPEN in this generation */
if (stateid->seqid == cpu_to_be32(1))
- nfs_state_log_update_open_stateid(state);
- else
- set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
- return true;
- }
-
- if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
- nfs_state_log_out_of_order_open_stateid(state, stateid);
- return true;
+ return true;
}
return false;
}
@@ -1673,16 +1666,16 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
int status = 0;
for (;;) {
- if (!nfs_need_update_open_stateid(state, stateid))
- return;
- if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+ if (nfs_stateid_is_sequential(state, stateid))
break;
+
if (status)
break;
/* Rely on seqids for serialisation with NFSv4.0 */
if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
break;
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
/*
* Ensure we process the state changes in the same order
@@ -1693,6 +1686,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
spin_unlock(&state->owner->so_lock);
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
+
if (!signal_pending(current)) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
@@ -3435,7 +3429,8 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
__be32 seqid_open;
u32 dst_seqid;
bool ret;
- int seq;
+ int seq, status = -EAGAIN;
+ DEFINE_WAIT(wait);
for (;;) {
ret = false;
@@ -3447,15 +3442,41 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
continue;
break;
}
+
+ write_seqlock(&state->seqlock);
seqid_open = state->open_stateid.seqid;
- if (read_seqretry(&state->seqlock, seq))
- continue;
dst_seqid = be32_to_cpu(dst->seqid);
- if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
- dst->seqid = cpu_to_be32(dst_seqid + 1);
- else
+
+ /* Did another OPEN bump the state's seqid? try again: */
+ if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) {
dst->seqid = seqid_open;
+ write_sequnlock(&state->seqlock);
+ ret = true;
+ break;
+ }
+
+ /* server says we're behind but we haven't seen the update yet */
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ write_sequnlock(&state->seqlock);
+ trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
+
+ if (signal_pending(current))
+ status = -EINTR;
+ else
+ if (schedule_timeout(5*HZ) != 0)
+ status = 0;
+
+ finish_wait(&state->waitq, &wait);
+
+ if (!status)
+ continue;
+ if (status == -EINTR)
+ break;
+
+ /* we slept the whole 5 seconds, we must have lost a seqid */
+ dst->seqid = cpu_to_be32(dst_seqid + 1);
ret = true;
break;
}
@@ -3632,9 +3653,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
/* Close-to-open cache consistency revalidation */
- if (!nfs4_have_delegation(inode, FMODE_READ))
+ if (!nfs4_have_delegation(inode, FMODE_READ)) {
calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
- else
+ nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL);
+ } else
calldata->arg.bitmask = NULL;
}
@@ -5255,28 +5277,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+static bool nfs4_read_plus_not_supported(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
+ struct rpc_message *msg = &task->tk_msg;
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
+ server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+ server->caps &= ~NFS_CAP_READ_PLUS;
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ rpc_restart_call_prepare(task);
+ return true;
+ }
+ return false;
+}
+
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
dprintk("--> %s\n", __func__);
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
return -EAGAIN;
+ if (nfs4_read_plus_not_supported(task, hdr))
+ return -EAGAIN;
if (task->tk_status > 0)
nfs_invalidate_atime(hdr->inode);
return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
nfs4_read_done_cb(task, hdr);
}
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+ if (server->caps & NFS_CAP_READ_PLUS)
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+ else
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#else
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#endif /* CONFIG_NFS_V4_2 */
+
static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
hdr->timestamp = jiffies;
if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_read_done_cb;
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
}
@@ -5360,6 +5414,38 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
+static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
+ struct nfs_server *server,
+ struct nfs4_label *label)
+{
+
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+ if ((cache_validity & NFS_INO_INVALID_DATA) ||
+ (cache_validity & NFS_INO_REVAL_PAGECACHE) ||
+ (cache_validity & NFS_INO_REVAL_FORCED) ||
+ (cache_validity & NFS_INO_INVALID_OTHER))
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+
+ if (cache_validity & NFS_INO_INVALID_ATIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+ if (cache_validity & NFS_INO_INVALID_ACCESS)
+ bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
+ FATTR4_WORD1_OWNER_GROUP;
+ if (cache_validity & NFS_INO_INVALID_ACL)
+ bitmask[0] |= FATTR4_WORD0_ACL;
+ if (cache_validity & NFS_INO_INVALID_LABEL)
+ bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ if (cache_validity & NFS_INO_INVALID_CTIME)
+ bitmask[0] |= FATTR4_WORD0_CHANGE;
+ if (cache_validity & NFS_INO_INVALID_MTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
+ if (cache_validity & NFS_INO_INVALID_SIZE)
+ bitmask[0] |= FATTR4_WORD0_SIZE;
+ if (cache_validity & NFS_INO_INVALID_BLOCKS)
+ bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+}
+
static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg,
struct rpc_clnt **clnt)
@@ -5369,8 +5455,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
if (!nfs4_write_need_cache_consistency_data(hdr)) {
hdr->args.bitmask = NULL;
hdr->res.fattr = NULL;
- } else
+ } else {
hdr->args.bitmask = server->cache_consistency_bitmask;
+ nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL);
+ }
if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_write_done_cb;
@@ -6006,9 +6094,34 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
+static size_t
+nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen)
+{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ struct nfs_netns_client *nn_clp = nn->nfs_client;
+ const char *id;
+
+ buf[0] = '\0';
+
+ if (nn_clp) {
+ rcu_read_lock();
+ id = rcu_dereference(nn_clp->identifier);
+ if (id)
+ strscpy(buf, id, buflen);
+ rcu_read_unlock();
+ }
+
+ if (nfs4_client_id_uniquifier[0] != '\0' && buf[0] == '\0')
+ strscpy(buf, nfs4_client_id_uniquifier, buflen);
+
+ return strlen(buf);
+}
+
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
size_t len;
char *str;
@@ -6022,8 +6135,11 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1;
rcu_read_unlock();
- if (nfs4_client_id_uniquifier[0] != '\0')
- len += strlen(nfs4_client_id_uniquifier) + 1;
+
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
@@ -6037,10 +6153,9 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
return -ENOMEM;
rcu_read_lock();
- if (nfs4_client_id_uniquifier[0] != '\0')
+ if (buflen)
scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s",
- clp->cl_rpcclient->cl_nodename,
- nfs4_client_id_uniquifier,
+ clp->cl_rpcclient->cl_nodename, buf,
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_ADDR));
else
@@ -6055,50 +6170,23 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
}
static int
-nfs4_init_uniquifier_client_string(struct nfs_client *clp)
-{
- size_t len;
- char *str;
-
- len = 10 + 10 + 1 + 10 + 1 +
- strlen(nfs4_client_id_uniquifier) + 1 +
- strlen(clp->cl_rpcclient->cl_nodename) + 1;
-
- if (len > NFS4_OPAQUE_LIMIT + 1)
- return -EINVAL;
-
- /*
- * Since this string is allocated at mount time, and held until the
- * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
- * about a memory-reclaim deadlock.
- */
- str = kmalloc(len, GFP_KERNEL);
- if (!str)
- return -ENOMEM;
-
- scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
- clp->rpc_ops->version, clp->cl_minorversion,
- nfs4_client_id_uniquifier,
- clp->cl_rpcclient->cl_nodename);
- clp->cl_owner_id = str;
- return 0;
-}
-
-static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
size_t len;
char *str;
if (clp->cl_owner_id != NULL)
return 0;
- if (nfs4_client_id_uniquifier[0] != '\0')
- return nfs4_init_uniquifier_client_string(clp);
-
len = 10 + 10 + 1 + 10 + 1 +
strlen(clp->cl_rpcclient->cl_nodename) + 1;
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
@@ -6111,9 +6199,14 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- scnprintf(str, len, "Linux NFSv%u.%u %s",
- clp->rpc_ops->version, clp->cl_minorversion,
- clp->cl_rpcclient->cl_nodename);
+ if (buflen)
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ buf, clp->cl_rpcclient->cl_nodename);
+ else
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ clp->cl_rpcclient->cl_nodename);
clp->cl_owner_id = str;
return 0;
}
@@ -6406,6 +6499,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
+ nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL);
nfs_copy_fh(&data->fh, NFS_FH(inode));
nfs4_stateid_copy(&data->stateid, stateid);
data->res.fattr = &data->fattr;
@@ -7440,7 +7534,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
len = security_inode_listsecurity(inode, list, list_len);
- if (list_len && len > list_len)
+ if (len >= 0 && list_len && len > list_len)
return -ERANGE;
}
return len;
@@ -8039,9 +8133,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
* both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
* DS flags set.
*/
-static int nfs4_check_cl_exchange_flags(u32 flags)
+static int nfs4_check_cl_exchange_flags(u32 flags, u32 version)
{
- if (flags & ~EXCHGID4_FLAG_MASK_R)
+ if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R))
+ goto out_inval;
+ else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R))
goto out_inval;
if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
(flags & EXCHGID4_FLAG_USE_NON_PNFS))
@@ -8454,7 +8550,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
if (status != 0)
goto out;
- status = nfs4_check_cl_exchange_flags(resp->flags);
+ status = nfs4_check_cl_exchange_flags(resp->flags,
+ clp->cl_mvops->minor_version);
if (status != 0)
goto out;
@@ -9693,7 +9790,6 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct rpc_clnt *clnt = server->client;
struct nfs4_call_sync_data data = {
.seq_server = server,
.seq_args = &args.seq_args,
@@ -9710,8 +9806,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
if (use_integrity) {
- clnt = server->nfs_client->cl_rpcclient;
- task_setup.rpc_client = clnt;
+ task_setup.rpc_client = server->nfs_client->cl_rpcclient;
cred = nfs4_get_clid_cred(server->nfs_client);
msg.rpc_cred = cred;
@@ -10165,7 +10260,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
- | NFS_CAP_LAYOUTERROR,
+ | NFS_CAP_LAYOUTERROR
+ | NFS_CAP_READ_PLUS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 0c1ab846b83d..93f5c1678ec2 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -7,6 +7,7 @@
#include <linux/mount.h>
#include <linux/nfs4_mount.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
#include "delegation.h"
#include "internal.h"
#include "nfs4_fs.h"
@@ -279,6 +280,9 @@ static int __init init_nfs_v4(void)
if (err)
goto out2;
+#ifdef CONFIG_NFS_V4_2
+ nfs42_ssc_register_ops();
+#endif
register_nfs_version(&nfs_v4);
return 0;
out2:
@@ -297,6 +301,7 @@ static void __exit exit_nfs_v4(void)
unregister_nfs_version(&nfs_v4);
#ifdef CONFIG_NFS_V4_2
nfs4_xattr_cache_exit();
+ nfs42_ssc_unregister_ops();
#endif
nfs4_unregister_sysctl();
nfs_idmap_quit();
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index b4f852d4d099..484c1da96dea 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1511,6 +1511,7 @@ DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0b3510f62623..c6dbfcae7517 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5308,7 +5308,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
uint32_t attrlen,
bitmap[3] = {0};
int status;
- unsigned int pg_offset;
res->acl_len = 0;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -5316,9 +5315,6 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
xdr_enter_page(xdr, xdr->buf->page_len);
- /* Calculate the offset of the page data */
- pg_offset = xdr->buf->head[0].iov_len;
-
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
goto out;
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -5331,7 +5327,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
/* The bitmap (xdr len + bitmaps) and the attr xdr len words
* are stored with the acl data to handle the problem of
* variable length bitmaps.*/
- res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
+ res->acl_data_offset = xdr_page_pos(xdr);
res->acl_len = attrlen;
/* Check for receive buffer overflow */
@@ -7619,6 +7615,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(SETXATTR, enc_setxattr, dec_setxattr),
PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
+ PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 71f7741126b6..0e50b9d45c32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -902,7 +902,7 @@ restart:
}
/*
- * Called by the state manger to remove all layouts established under an
+ * Called by the state manager to remove all layouts established under an
* expired lease.
*/
void
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f943e37853fa..4034102010f0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -57,6 +57,7 @@
#include <linux/rcupdate.h>
#include <linux/uaccess.h>
+#include <linux/nfs_ssc.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -85,6 +86,10 @@ const struct super_operations nfs_sops = {
};
EXPORT_SYMBOL_GPL(nfs_sops);
+static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = {
+ .sco_sb_deactive = nfs_sb_deactive,
+};
+
#if IS_ENABLED(CONFIG_NFS_V4)
static int __init register_nfs4_fs(void)
{
@@ -106,6 +111,16 @@ static void unregister_nfs4_fs(void)
}
#endif
+static void nfs_ssc_register_ops(void)
+{
+ nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
+}
+
+static void nfs_ssc_unregister_ops(void)
+{
+ nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
+}
+
static struct shrinker acl_shrinker = {
.count_objects = nfs_access_cache_count,
.scan_objects = nfs_access_cache_scan,
@@ -133,6 +148,7 @@ int __init register_nfs_fs(void)
ret = register_shrinker(&acl_shrinker);
if (ret < 0)
goto error_3;
+ nfs_ssc_register_ops();
return 0;
error_3:
nfs_unregister_sysctl();
@@ -152,6 +168,7 @@ void __exit unregister_nfs_fs(void)
unregister_shrinker(&acl_shrinker);
nfs_unregister_sysctl();
unregister_nfs4_fs();
+ nfs_ssc_unregister_ops();
unregister_filesystem(&nfs_fs_type);
}
@@ -889,7 +906,7 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
default:
if (rpcauth_get_gssinfo(flavor, &info) != 0)
continue;
- /* Fallthrough */
+ break;
}
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
ctx->selected_flavor = flavor;
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index c489496b5659..8cb70755e3c9 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -79,7 +79,12 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
struct nfs_netns_client *c = container_of(kobj,
struct nfs_netns_client,
kobject);
- return scnprintf(buf, PAGE_SIZE, "%s\n", c->identifier);
+ ssize_t ret;
+
+ rcu_read_lock();
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier));
+ rcu_read_unlock();
+ return ret;
}
/* Strip trailing '\n' */
@@ -107,7 +112,7 @@ static ssize_t nfs_netns_identifier_store(struct kobject *kobj,
p = kmemdup_nul(buf, len, GFP_KERNEL);
if (!p)
return -ENOMEM;
- old = xchg(&c->identifier, p);
+ old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1);
if (old) {
synchronize_rcu();
kfree(old);
@@ -121,7 +126,7 @@ static void nfs_netns_client_release(struct kobject *kobj)
struct nfs_netns_client,
kobject);
- kfree(c->identifier);
+ kfree(rcu_dereference_raw(c->identifier));
kfree(c);
}
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
index ebcbdc40483b..5501ef573c32 100644
--- a/fs/nfs/sysfs.h
+++ b/fs/nfs/sysfs.h
@@ -11,7 +11,7 @@
struct nfs_netns_client {
struct kobject kobject;
struct net *net;
- const char *identifier;
+ const char __rcu *identifier;
};
extern struct kobject *nfs_client_kobj;
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index 4bebe834c009..fa82f5aaa6d9 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
nfs_acl-objs := nfsacl.o
obj-$(CONFIG_GRACE_PERIOD) += grace.o
+obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o
diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
new file mode 100644
index 000000000000..f43bbb373913
--- /dev/null
+++ b/fs/nfs_common/nfs_ssc.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/nfs_common/nfs_ssc_comm.c
+ *
+ * Helper for knfsd's SSC to access ops in NFS client modules
+ *
+ * Author: Dai Ngo <dai.ngo@oracle.com>
+ *
+ * Copyright (c) 2020, Oracle and/or its affiliates.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/nfs_ssc.h>
+#include "../nfs/nfs4_fs.h"
+
+MODULE_LICENSE("GPL");
+
+struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
+EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl);
+
+#ifdef CONFIG_NFS_V4_2
+/**
+ * nfs42_ssc_register - install the NFS_V4 client ops in the nfs_ssc_client_tbl
+ * @ops: NFS_V4 ops to be installed
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_register(const struct nfs4_ssc_client_ops *ops)
+{
+ nfs_ssc_client_tbl.ssc_nfs4_ops = ops;
+}
+EXPORT_SYMBOL_GPL(nfs42_ssc_register);
+
+/**
+ * nfs42_ssc_unregister - uninstall the NFS_V4 client ops from
+ * the nfs_ssc_client_tbl
+ * @ops: ops to be uninstalled
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_unregister(const struct nfs4_ssc_client_ops *ops)
+{
+ if (nfs_ssc_client_tbl.ssc_nfs4_ops != ops)
+ return;
+
+ nfs_ssc_client_tbl.ssc_nfs4_ops = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs42_ssc_unregister);
+#endif /* CONFIG_NFS_V4_2 */
+
+#ifdef CONFIG_NFS_V4_2
+/**
+ * nfs_ssc_register - install the NFS_FS client ops in the nfs_ssc_client_tbl
+ * @ops: NFS_FS ops to be installed
+ *
+ * Return values:
+ * None
+ */
+void nfs_ssc_register(const struct nfs_ssc_client_ops *ops)
+{
+ nfs_ssc_client_tbl.ssc_nfs_ops = ops;
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_register);
+
+/**
+ * nfs_ssc_unregister - uninstall the NFS_FS client ops from
+ * the nfs_ssc_client_tbl
+ * @ops: ops to be uninstalled
+ *
+ * Return values:
+ * None
+ */
+void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops)
+{
+ if (nfs_ssc_client_tbl.ssc_nfs_ops != ops)
+ return;
+ nfs_ssc_client_tbl.ssc_nfs_ops = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_unregister);
+
+#else
+void nfs_ssc_register(const struct nfs_ssc_client_ops *ops)
+{
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_register);
+
+void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops)
+{
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_unregister);
+#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 99d2cae91bd6..dbbc583d6273 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -136,7 +136,7 @@ config NFSD_FLEXFILELAYOUT
config NFSD_V4_2_INTER_SSC
bool "NFSv4.2 inter server to server COPY"
- depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 && NFS_FS=y
+ depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
help
This option enables support for NFSv4.2 inter server to
server copy where the destination server calls the NFSv4.2
@@ -156,13 +156,3 @@ config NFSD_V4_SECURITY_LABEL
If you do not wish to enable fine-grained security labels SELinux or
Smack policies on NFSv4 files, say N.
-
-config NFSD_FAULT_INJECTION
- bool "NFS server manual fault injection"
- depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN
- help
- This option enables support for manually injecting faults
- into the NFS server. This is intended to be used for
- testing error recovery on the NFS client.
-
- If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6a40b1afe703..3f0983e93a99 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -13,7 +13,6 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o \
stats.o filecache.o
-nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cb777fe82988..21e404e7cb68 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1002,7 +1002,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
if (nfsd4_spo_must_allow(rqstp))
return 0;
- return nfserr_wrongsec;
+ return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
}
/*
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index c8b9d2667ee6..3c6c2f7d1688 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -889,7 +889,7 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
- if ((need & nf->nf_may) != need)
+ if (nf->nf_may != need)
continue;
if (nf->nf_inode != inode)
continue;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index cbab1d2d8a75..6a900f770dd2 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -14,7 +14,6 @@
#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
-#define RETURN_STATUS(st) { resp->status = (st); return (st); }
/*
* NULL call.
@@ -22,7 +21,7 @@
static __be32
nfsacld_proc_null(struct svc_rqst *rqstp)
{
- return nfs_ok;
+ return rpc_success;
}
/*
@@ -35,24 +34,25 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
struct posix_acl *acl;
struct inode *inode;
svc_fh *fh;
- __be32 nfserr = 0;
dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
- if (nfserr)
- RETURN_STATUS(nfserr);
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~NFS_ACL_MASK)
- RETURN_STATUS(nfserr_inval);
+ if (argp->mask & ~NFS_ACL_MASK) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
resp->mask = argp->mask;
- nfserr = fh_getattr(fh, &resp->stat);
- if (nfserr)
- RETURN_STATUS(nfserr);
+ resp->status = fh_getattr(fh, &resp->stat);
+ if (resp->status != nfs_ok)
+ goto out;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = get_acl(inode, ACL_TYPE_ACCESS);
@@ -61,7 +61,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
+ resp->status = nfserrno(PTR_ERR(acl));
goto fail;
}
resp->acl_access = acl;
@@ -71,19 +71,20 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
of a non-directory! */
acl = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
+ resp->status = nfserrno(PTR_ERR(acl));
goto fail;
}
resp->acl_default = acl;
}
/* resp->acl_{access,default} are released in nfssvc_release_getacl. */
- RETURN_STATUS(0);
+out:
+ return rpc_success;
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
- RETURN_STATUS(nfserr);
+ goto out;
}
/*
@@ -95,14 +96,13 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
struct nfsd_attrstat *resp = rqstp->rq_resp;
struct inode *inode;
svc_fh *fh;
- __be32 nfserr = 0;
int error;
dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
- if (nfserr)
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (resp->status != nfs_ok)
goto out;
inode = d_inode(fh->fh_dentry);
@@ -124,19 +124,20 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
fh_drop_write(fh);
- nfserr = fh_getattr(fh, &resp->stat);
+ resp->status = fh_getattr(fh, &resp->stat);
out:
/* argp->acl_{access,default} may have been allocated in
nfssvc_decode_setaclargs. */
posix_acl_release(argp->acl_access);
posix_acl_release(argp->acl_default);
- return nfserr;
+ return rpc_success;
+
out_drop_lock:
fh_unlock(fh);
fh_drop_write(fh);
out_errno:
- nfserr = nfserrno(error);
+ resp->status = nfserrno(error);
goto out;
}
@@ -147,15 +148,16 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
+
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
- if (nfserr)
- return nfserr;
- nfserr = fh_getattr(&resp->fh, &resp->stat);
- return nfserr;
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
@@ -165,7 +167,6 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp)
{
struct nfsd3_accessargs *argp = rqstp->rq_argp;
struct nfsd3_accessres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: ACCESS(2acl) %s 0x%x\n",
SVCFH_fmt(&argp->fh),
@@ -173,16 +174,22 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->access = argp->access;
- nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
- if (nfserr)
- return nfserr;
- nfserr = fh_getattr(&resp->fh, &resp->stat);
- return nfserr;
+ resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+ if (resp->status != nfs_ok)
+ goto out;
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
* XDR decode functions
*/
+static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_getaclargs *argp = rqstp->rq_argp;
@@ -268,6 +275,10 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
int n;
int w;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
/*
* Since this is version 2, the check for nfserr in
* nfsd_dispatch actually ensures the following cannot happen.
@@ -307,7 +318,12 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_attrstat *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ goto out;
+
p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+out:
return xdr_ressize_check(rqstp, p);
}
@@ -316,8 +332,13 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_accessres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ goto out;
+
p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->access);
+out:
return xdr_ressize_check(rqstp, p);
}
@@ -347,36 +368,63 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp)
fh_put(&resp->fh);
}
-#define nfsaclsvc_decode_voidargs NULL
-#define nfsaclsvc_release_void NULL
-#define nfsd3_fhandleargs nfsd_fhandle
-#define nfsd3_attrstatres nfsd_attrstat
-#define nfsd3_voidres nfsd3_voidargs
struct nfsd3_voidargs { int dummy; };
-#define PROC(name, argt, rest, relt, cache, respsize) \
-{ \
- .pc_func = nfsacld_proc_##name, \
- .pc_decode = nfsaclsvc_decode_##argt##args, \
- .pc_encode = nfsaclsvc_encode_##rest##res, \
- .pc_release = nfsaclsvc_release_##relt, \
- .pc_argsize = sizeof(struct nfsd3_##argt##args), \
- .pc_ressize = sizeof(struct nfsd3_##rest##res), \
- .pc_cachetype = cache, \
- .pc_xdrressize = respsize, \
-}
-
#define ST 1 /* status*/
#define AT 21 /* attributes */
#define pAT (1+AT) /* post attributes - conditional */
#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */
-static const struct svc_procedure nfsd_acl_procedures2[] = {
- PROC(null, void, void, void, RC_NOCACHE, ST),
- PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)),
- PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT),
- PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT),
- PROC(access, access, access, access, RC_NOCACHE, ST+AT+1),
+static const struct svc_procedure nfsd_acl_procedures2[5] = {
+ [ACLPROC2_NULL] = {
+ .pc_func = nfsacld_proc_null,
+ .pc_decode = nfsaclsvc_decode_voidarg,
+ .pc_encode = nfsaclsvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd3_voidargs),
+ .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
+ },
+ [ACLPROC2_GETACL] = {
+ .pc_func = nfsacld_proc_getacl,
+ .pc_decode = nfsaclsvc_decode_getaclargs,
+ .pc_encode = nfsaclsvc_encode_getaclres,
+ .pc_release = nfsaclsvc_release_getacl,
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
+ },
+ [ACLPROC2_SETACL] = {
+ .pc_func = nfsacld_proc_setacl,
+ .pc_decode = nfsaclsvc_decode_setaclargs,
+ .pc_encode = nfsaclsvc_encode_attrstatres,
+ .pc_release = nfsaclsvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [ACLPROC2_GETATTR] = {
+ .pc_func = nfsacld_proc_getattr,
+ .pc_decode = nfsaclsvc_decode_fhandleargs,
+ .pc_encode = nfsaclsvc_encode_attrstatres,
+ .pc_release = nfsaclsvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [ACLPROC2_ACCESS] = {
+ .pc_func = nfsacld_proc_access,
+ .pc_decode = nfsaclsvc_decode_accessargs,
+ .pc_encode = nfsaclsvc_encode_accessres,
+ .pc_release = nfsaclsvc_release_access,
+ .pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_ressize = sizeof(struct nfsd3_accessres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT+1,
+ },
};
static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)];
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 13bca4a2f89d..34a394e50e1d 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -13,15 +13,13 @@
#include "xdr3.h"
#include "vfs.h"
-#define RETURN_STATUS(st) { resp->status = (st); return (st); }
-
/*
* NULL call.
*/
static __be32
nfsd3_proc_null(struct svc_rqst *rqstp)
{
- return nfs_ok;
+ return rpc_success;
}
/*
@@ -34,17 +32,18 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
struct posix_acl *acl;
struct inode *inode;
svc_fh *fh;
- __be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
- if (nfserr)
- RETURN_STATUS(nfserr);
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~NFS_ACL_MASK)
- RETURN_STATUS(nfserr_inval);
+ if (argp->mask & ~NFS_ACL_MASK) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
resp->mask = argp->mask;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
@@ -54,7 +53,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
+ resp->status = nfserrno(PTR_ERR(acl));
goto fail;
}
resp->acl_access = acl;
@@ -64,19 +63,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
of a non-directory! */
acl = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
+ resp->status = nfserrno(PTR_ERR(acl));
goto fail;
}
resp->acl_default = acl;
}
/* resp->acl_{access,default} are released in nfs3svc_release_getacl. */
- RETURN_STATUS(0);
+out:
+ return rpc_success;
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
- RETURN_STATUS(nfserr);
+ goto out;
}
/*
@@ -88,12 +88,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
struct nfsd3_attrstat *resp = rqstp->rq_resp;
struct inode *inode;
svc_fh *fh;
- __be32 nfserr = 0;
int error;
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
- if (nfserr)
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (resp->status != nfs_ok)
goto out;
inode = d_inode(fh->fh_dentry);
@@ -113,13 +112,13 @@ out_drop_lock:
fh_unlock(fh);
fh_drop_write(fh);
out_errno:
- nfserr = nfserrno(error);
+ resp->status = nfserrno(error);
out:
/* argp->acl_{access,default} may have been allocated in
nfs3svc_decode_setaclargs. */
posix_acl_release(argp->acl_access);
posix_acl_release(argp->acl_default);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
/*
@@ -174,6 +173,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
+ *p++ = resp->status;
p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
struct inode *inode = d_inode(dentry);
@@ -218,8 +218,8 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
-
return xdr_ressize_check(rqstp, p);
}
@@ -235,33 +235,43 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp)
posix_acl_release(resp->acl_default);
}
-#define nfs3svc_decode_voidargs NULL
-#define nfs3svc_release_void NULL
-#define nfsd3_setaclres nfsd3_attrstat
-#define nfsd3_voidres nfsd3_voidargs
struct nfsd3_voidargs { int dummy; };
-#define PROC(name, argt, rest, relt, cache, respsize) \
-{ \
- .pc_func = nfsd3_proc_##name, \
- .pc_decode = nfs3svc_decode_##argt##args, \
- .pc_encode = nfs3svc_encode_##rest##res, \
- .pc_release = nfs3svc_release_##relt, \
- .pc_argsize = sizeof(struct nfsd3_##argt##args), \
- .pc_ressize = sizeof(struct nfsd3_##rest##res), \
- .pc_cachetype = cache, \
- .pc_xdrressize = respsize, \
-}
-
#define ST 1 /* status*/
#define AT 21 /* attributes */
#define pAT (1+AT) /* post attributes - conditional */
#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */
-static const struct svc_procedure nfsd_acl_procedures3[] = {
- PROC(null, void, void, void, RC_NOCACHE, ST),
- PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)),
- PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT),
+static const struct svc_procedure nfsd_acl_procedures3[3] = {
+ [ACLPROC3_NULL] = {
+ .pc_func = nfsd3_proc_null,
+ .pc_decode = nfs3svc_decode_voidarg,
+ .pc_encode = nfs3svc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd3_voidargs),
+ .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
+ },
+ [ACLPROC3_GETACL] = {
+ .pc_func = nfsd3_proc_getacl,
+ .pc_decode = nfs3svc_decode_getaclargs,
+ .pc_encode = nfs3svc_encode_getaclres,
+ .pc_release = nfs3svc_release_getacl,
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
+ },
+ [ACLPROC3_SETACL] = {
+ .pc_func = nfsd3_proc_setacl,
+ .pc_decode = nfs3svc_decode_setaclargs,
+ .pc_encode = nfs3svc_encode_setaclres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd3_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT,
+ },
};
static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)];
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 288bc76b4574..14468613d150 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -15,8 +15,6 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-#define RETURN_STATUS(st) { resp->status = (st); return (st); }
-
static int nfs3_ftypes[] = {
0, /* NF3NON */
S_IFREG, /* NF3REG */
@@ -34,7 +32,7 @@ static int nfs3_ftypes[] = {
static __be32
nfsd3_proc_null(struct svc_rqst *rqstp)
{
- return nfs_ok;
+ return rpc_success;
}
/*
@@ -45,20 +43,19 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: GETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0,
- NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
- if (nfserr)
- RETURN_STATUS(nfserr);
-
- nfserr = fh_getattr(&resp->fh, &resp->stat);
-
- RETURN_STATUS(nfserr);
+ resp->status = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
@@ -69,15 +66,14 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
{
struct nfsd3_sattrargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: SETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
- argp->check_guard, argp->guardtime);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
+ argp->check_guard, argp->guardtime);
+ return rpc_success;
}
/*
@@ -88,7 +84,6 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp)
{
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: LOOKUP(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -98,11 +93,10 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
- nfserr = nfsd_lookup(rqstp, &resp->dirfh,
- argp->name,
- argp->len,
- &resp->fh);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_lookup(rqstp, &resp->dirfh,
+ argp->name, argp->len,
+ &resp->fh);
+ return rpc_success;
}
/*
@@ -113,7 +107,6 @@ nfsd3_proc_access(struct svc_rqst *rqstp)
{
struct nfsd3_accessargs *argp = rqstp->rq_argp;
struct nfsd3_accessres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: ACCESS(3) %s 0x%x\n",
SVCFH_fmt(&argp->fh),
@@ -121,8 +114,8 @@ nfsd3_proc_access(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->access = argp->access;
- nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+ return rpc_success;
}
/*
@@ -133,15 +126,14 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp)
{
struct nfsd3_readlinkargs *argp = rqstp->rq_argp;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
fh_copy(&resp->fh, &argp->fh);
resp->len = NFS3_MAXPATHLEN;
- nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
+ return rpc_success;
}
/*
@@ -152,7 +144,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
- __be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
unsigned long cnt = min(argp->count, max_blocksize);
@@ -169,12 +160,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_read(rqstp, &resp->fh,
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &resp->count,
- &resp->eof);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, argp->vlen, &resp->count,
+ &resp->eof);
+ return rpc_success;
}
/*
@@ -185,7 +174,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
{
struct nfsd3_writeargs *argp = rqstp->rq_argp;
struct nfsd3_writeres *resp = rqstp->rq_resp;
- __be32 nfserr;
unsigned long cnt = argp->len;
unsigned int nvecs;
@@ -199,13 +187,16 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
resp->committed = argp->stable;
nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
&argp->first, cnt);
- if (!nvecs)
- RETURN_STATUS(nfserr_io);
- nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, nvecs, &cnt,
- resp->committed, resp->verf);
+ if (!nvecs) {
+ resp->status = nfserr_io;
+ goto out;
+ }
+ resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, nvecs, &cnt,
+ resp->committed, resp->verf);
resp->count = cnt;
- RETURN_STATUS(nfserr);
+out:
+ return rpc_success;
}
/*
@@ -220,7 +211,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
struct nfsd3_diropres *resp = rqstp->rq_resp;
svc_fh *dirfhp, *newfhp = NULL;
struct iattr *attr;
- __be32 nfserr;
dprintk("nfsd: CREATE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -241,11 +231,10 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
}
/* Now create the file and set attributes */
- nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, newfhp,
- argp->createmode, (u32 *)argp->verf, NULL, NULL);
-
- RETURN_STATUS(nfserr);
+ resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
+ attr, newfhp, argp->createmode,
+ (u32 *)argp->verf, NULL, NULL);
+ return rpc_success;
}
/*
@@ -256,7 +245,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: MKDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -266,10 +254,10 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
- nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
+ resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+ &argp->attrs, S_IFDIR, 0, &resp->fh);
fh_unlock(&resp->dirfh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
static __be32
@@ -277,18 +265,23 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd3_symlinkargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
- if (argp->tlen == 0)
- RETURN_STATUS(nfserr_inval);
- if (argp->tlen > NFS3_MAXPATHLEN)
- RETURN_STATUS(nfserr_nametoolong);
+ if (argp->tlen == 0) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
+ if (argp->tlen > NFS3_MAXPATHLEN) {
+ resp->status = nfserr_nametoolong;
+ goto out;
+ }
argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
page_address(rqstp->rq_arg.pages[0]),
argp->tlen);
- if (IS_ERR(argp->tname))
- RETURN_STATUS(nfserrno(PTR_ERR(argp->tname)));
+ if (IS_ERR(argp->tname)) {
+ resp->status = nfserrno(PTR_ERR(argp->tname));
+ goto out;
+ }
dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh),
@@ -297,10 +290,11 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
- nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
- argp->tname, &resp->fh);
+ resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
+ argp->flen, argp->tname, &resp->fh);
kfree(argp->tname);
- RETURN_STATUS(nfserr);
+out:
+ return rpc_success;
}
/*
@@ -311,7 +305,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
{
struct nfsd3_mknodargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
int type;
dev_t rdev = 0;
@@ -323,22 +316,28 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
- if (argp->ftype == 0 || argp->ftype >= NF3BAD)
- RETURN_STATUS(nfserr_inval);
+ if (argp->ftype == 0 || argp->ftype >= NF3BAD) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) {
rdev = MKDEV(argp->major, argp->minor);
if (MAJOR(rdev) != argp->major ||
- MINOR(rdev) != argp->minor)
- RETURN_STATUS(nfserr_inval);
- } else
- if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO)
- RETURN_STATUS(nfserr_inval);
+ MINOR(rdev) != argp->minor) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
+ } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
type = nfs3_ftypes[argp->ftype];
- nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, type, rdev, &resp->fh);
+ resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+ &argp->attrs, type, rdev, &resp->fh);
fh_unlock(&resp->dirfh);
- RETURN_STATUS(nfserr);
+out:
+ return rpc_success;
}
/*
@@ -349,7 +348,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
{
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: REMOVE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -358,9 +356,10 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
/* Unlink. -S_IFDIR means file must not be a directory */
fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
+ resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
+ argp->name, argp->len);
fh_unlock(&resp->fh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
/*
@@ -371,7 +370,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
{
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: RMDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -379,9 +377,10 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
argp->name);
fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
+ resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
+ argp->name, argp->len);
fh_unlock(&resp->fh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
static __be32
@@ -389,7 +388,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp)
{
struct nfsd3_renameargs *argp = rqstp->rq_argp;
struct nfsd3_renameres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: RENAME(3) %s %.*s ->\n",
SVCFH_fmt(&argp->ffh),
@@ -402,9 +400,9 @@ nfsd3_proc_rename(struct svc_rqst *rqstp)
fh_copy(&resp->ffh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
- nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
- &resp->tfh, argp->tname, argp->tlen);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
+ &resp->tfh, argp->tname, argp->tlen);
+ return rpc_success;
}
static __be32
@@ -412,7 +410,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
{
struct nfsd3_linkargs *argp = rqstp->rq_argp;
struct nfsd3_linkres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: LINK(3) %s ->\n",
SVCFH_fmt(&argp->ffh));
@@ -423,9 +420,9 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
- nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
- &resp->fh);
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
+ &resp->fh);
+ return rpc_success;
}
/*
@@ -436,7 +433,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
{
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
- __be32 nfserr;
int count = 0;
struct page **p;
caddr_t page_addr = NULL;
@@ -456,8 +452,8 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
resp->common.err = nfs_ok;
resp->buffer = argp->buffer;
resp->rqstp = rqstp;
- nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie,
- &resp->common, nfs3svc_encode_entry);
+ resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie,
+ &resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
count = 0;
for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
@@ -485,7 +481,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
resp->offset = NULL;
}
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
/*
@@ -497,7 +493,6 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
{
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
- __be32 nfserr;
int count = 0;
loff_t offset;
struct page **p;
@@ -520,17 +515,17 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
resp->rqstp = rqstp;
offset = argp->cookie;
- nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
- if (nfserr)
- RETURN_STATUS(nfserr);
+ resp->status = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
- if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
- RETURN_STATUS(nfserr_notsupp);
+ if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) {
+ resp->status = nfserr_notsupp;
+ goto out;
+ }
- nfserr = nfsd_readdir(rqstp, &resp->fh,
- &offset,
- &resp->common,
- nfs3svc_encode_entry_plus);
+ resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
+ &resp->common, nfs3svc_encode_entry_plus);
memcpy(resp->verf, argp->verf, 8);
for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
page_addr = page_address(*p);
@@ -555,7 +550,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
resp->offset = NULL;
}
- RETURN_STATUS(nfserr);
+out:
+ return rpc_success;
}
/*
@@ -566,14 +562,13 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+ resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
/*
@@ -584,7 +579,6 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_fsinfores *resp = rqstp->rq_resp;
- __be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
dprintk("nfsd: FSINFO(3) %s\n",
@@ -600,13 +594,13 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
- nfserr = fh_verify(rqstp, &argp->fh, 0,
- NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+ resp->status = fh_verify(rqstp, &argp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
/* Check special features of the file system. May request
* different read/write sizes for file systems known to have
* problems with large blocks */
- if (nfserr == 0) {
+ if (resp->status == nfs_ok) {
struct super_block *sb = argp->fh.fh_dentry->d_sb;
/* Note that we don't care for remote fs's here */
@@ -617,7 +611,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
}
fh_put(&argp->fh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
/*
@@ -628,7 +622,6 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: PATHCONF(3) %s\n",
SVCFH_fmt(&argp->fh));
@@ -641,9 +634,9 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp)
resp->p_case_insensitive = 0;
resp->p_case_preserving = 1;
- nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+ resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
- if (nfserr == 0) {
+ if (resp->status == nfs_ok) {
struct super_block *sb = argp->fh.fh_dentry->d_sb;
/* Note that we don't care for remote fs's here */
@@ -660,10 +653,9 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp)
}
fh_put(&argp->fh);
- RETURN_STATUS(nfserr);
+ return rpc_success;
}
-
/*
* Commit a file (range) to stable storage.
*/
@@ -672,21 +664,22 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
{
struct nfsd3_commitargs *argp = rqstp->rq_argp;
struct nfsd3_commitres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
SVCFH_fmt(&argp->fh),
argp->count,
(unsigned long long) argp->offset);
- if (argp->offset > NFS_OFFSET_MAX)
- RETURN_STATUS(nfserr_inval);
+ if (argp->offset > NFS_OFFSET_MAX) {
+ resp->status = nfserr_inval;
+ goto out;
+ }
fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count,
- resp->verf);
-
- RETURN_STATUS(nfserr);
+ resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
+ argp->count, resp->verf);
+out:
+ return rpc_success;
}
@@ -716,6 +709,7 @@ struct nfsd3_voidargs { int dummy; };
static const struct svc_procedure nfsd_procedures3[22] = {
[NFS3PROC_NULL] = {
.pc_func = nfsd3_proc_null,
+ .pc_decode = nfs3svc_decode_voidarg,
.pc_encode = nfs3svc_encode_voidres,
.pc_argsize = sizeof(struct nfsd3_voidargs),
.pc_ressize = sizeof(struct nfsd3_voidres),
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index aae514d40b64..9c23b6acf234 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -305,6 +305,12 @@ void fill_post_wcc(struct svc_fh *fhp)
* XDR decode functions
*/
int
+nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
+int
nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_fhandle *args = rqstp->rq_argp;
@@ -635,10 +641,7 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
/*
* XDR encode functions
*/
-/*
- * There must be an encoding function for void results so svc_process
- * will work properly.
- */
+
int
nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
{
@@ -651,6 +654,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ *p++ = resp->status;
if (resp->status == 0) {
lease_get_mtime(d_inode(resp->fh.fh_dentry),
&resp->stat.mtime);
@@ -665,6 +669,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_wcc_data(rqstp, p, &resp->fh);
return xdr_ressize_check(rqstp, p);
}
@@ -675,6 +680,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
if (resp->status == 0) {
p = encode_fh(p, &resp->fh);
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -689,6 +695,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_accessres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
if (resp->status == 0)
*p++ = htonl(resp->access);
@@ -701,6 +708,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->len);
@@ -723,6 +731,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
@@ -748,6 +757,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_writeres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
@@ -764,6 +774,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
if (resp->status == 0) {
*p++ = xdr_one;
p = encode_fh(p, &resp->fh);
@@ -779,6 +790,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_renameres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_wcc_data(rqstp, p, &resp->ffh);
p = encode_wcc_data(rqstp, p, &resp->tfh);
return xdr_ressize_check(rqstp, p);
@@ -790,6 +802,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_linkres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
p = encode_wcc_data(rqstp, p, &resp->tfh);
return xdr_ressize_check(rqstp, p);
@@ -801,6 +814,7 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readdirres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
if (resp->status == 0) {
@@ -1053,6 +1067,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
struct kstatfs *s = &resp->stats;
u64 bs = s->f_bsize;
+ *p++ = resp->status;
*p++ = xdr_zero; /* no post_op_attr */
if (resp->status == 0) {
@@ -1073,6 +1088,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_fsinfores *resp = rqstp->rq_resp;
+ *p++ = resp->status;
*p++ = xdr_zero; /* no post_op_attr */
if (resp->status == 0) {
@@ -1118,6 +1134,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_commitres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index eaf50eafa935..ad2fa1a8e7ad 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/sunrpc/addr.h>
+#include <linux/nfs_ssc.h>
#include "idmap.h"
#include "cache.h"
@@ -1247,7 +1248,7 @@ out_err:
static void
nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
{
- nfs_sb_deactive(ss_mnt->mnt_sb);
+ nfs_do_sb_deactive(ss_mnt->mnt_sb);
mntput(ss_mnt);
}
@@ -2165,7 +2166,7 @@ nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static __be32
nfsd4_proc_null(struct svc_rqst *rqstp)
{
- return nfs_ok;
+ return rpc_success;
}
static inline void nfsd4_increment_op_stats(u32 opnum)
@@ -2457,15 +2458,14 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
- cstate->status = status;
fh_put(current_fh);
fh_put(save_fh);
BUG_ON(cstate->replay_owner);
out:
+ cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
- dprintk("nfsv4 compound returned %d\n", ntohl(status));
- return status;
+ return rpc_success;
}
#define op_encode_hdr_size (2)
@@ -2591,6 +2591,20 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = svc_max_payload(rqstp);
+ u32 rlen = min(op->u.read.rd_length, maxcount);
+ /*
+ * If we detect that the file changed during hole encoding, then we
+ * recover by encoding the remaining reply as data. This means we need
+ * to set aside enough room to encode two data segments.
+ */
+ u32 seg_len = 2 * (1 + 2 + 1);
+
+ return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
u32 maxcount = 0, rlen = 0;
@@ -3163,6 +3177,13 @@ static const struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_COPY",
.op_rsize_bop = nfsd4_copy_rsize,
},
+ [OP_READ_PLUS] = {
+ .op_func = nfsd4_read,
+ .op_release = nfsd4_read_release,
+ .op_name = "OP_READ_PLUS",
+ .op_rsize_bop = nfsd4_read_plus_rsize,
+ .op_get_currentstateid = nfsd4_get_readstateid,
+ },
[OP_SEEK] = {
.op_func = nfsd4_seek,
.op_name = "OP_SEEK",
@@ -3231,7 +3252,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
if (!cstate->minorversion)
return false;
- if (cstate->spo_must_allowed == true)
+ if (cstate->spo_must_allowed)
return true;
opiter = resp->opcnt;
@@ -3279,6 +3300,7 @@ struct nfsd4_voidargs { int dummy; };
static const struct svc_procedure nfsd_procedures4[2] = {
[NFSPROC4_NULL] = {
.pc_func = nfsd4_proc_null,
+ .pc_decode = nfs4svc_decode_voidarg,
.pc_encode = nfs4svc_encode_voidres,
.pc_argsize = sizeof(struct nfsd4_voidargs),
.pc_ressize = sizeof(struct nfsd4_voidres),
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c09a2a4281ec..d7f27ed6b794 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4513,7 +4513,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
- if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID)
+ if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
+ dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
return 1;
switch (task->tk_status) {
@@ -4597,7 +4598,8 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
if (!i_am_nfsd())
return NULL;
rqst = kthread_data(current);
- if (!rqst->rq_lease_breaker)
+ /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
+ if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
return NULL;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
@@ -4954,7 +4956,6 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
writes--;
if (fp->fi_fds[O_RDWR])
writes--;
- WARN_ON_ONCE(writes < 0);
if (writes > 0)
return -EAGAIN;
spin_lock(&fp->fi_lock);
@@ -5126,7 +5127,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
- trace_nfsd_deleg_open(&dp->dl_stid.sc_stateid);
+ trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
nfs4_put_stid(&dp->dl_stid);
return;
@@ -5243,7 +5244,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
nfs4_open_delegation(current_fh, open, stp);
nodeleg:
status = nfs_ok;
- trace_nfsd_deleg_none(&stp->st_stid.sc_stateid);
+ trace_nfsd_open(&stp->st_stid.sc_stateid);
out:
/* 4.1 client trying to upgrade/downgrade delegation? */
if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp &&
@@ -5722,7 +5723,6 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
return find_readable_file(s->sc_file);
else
return find_writeable_file(s->sc_file);
- break;
}
return NULL;
@@ -7253,599 +7253,6 @@ nfs4_check_open_reclaim(clientid_t *clid,
return nfs_ok;
}
-#ifdef CONFIG_NFSD_FAULT_INJECTION
-static inline void
-put_client(struct nfs4_client *clp)
-{
- atomic_dec(&clp->cl_rpc_users);
-}
-
-static struct nfs4_client *
-nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
-{
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!nfsd_netns_ready(nn))
- return NULL;
-
- list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
- return clp;
- }
- return NULL;
-}
-
-u64
-nfsd_inject_print_clients(void)
-{
- struct nfs4_client *clp;
- u64 count = 0;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- char buf[INET6_ADDRSTRLEN];
-
- if (!nfsd_netns_ready(nn))
- return 0;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
- pr_info("NFS Client: %s\n", buf);
- ++count;
- }
- spin_unlock(&nn->client_lock);
-
- return count;
-}
-
-u64
-nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- clp = nfsd_find_client(addr, addr_size);
- if (clp) {
- if (mark_client_expired_locked(clp) == nfs_ok)
- ++count;
- else
- clp = NULL;
- }
- spin_unlock(&nn->client_lock);
-
- if (clp)
- expire_client(clp);
-
- return count;
-}
-
-u64
-nfsd_inject_forget_clients(u64 max)
-{
- u64 count = 0;
- struct nfs4_client *clp, *next;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
- if (mark_client_expired_locked(clp) == nfs_ok) {
- list_add(&clp->cl_lru, &reaplist);
- if (max != 0 && ++count >= max)
- break;
- }
- }
- spin_unlock(&nn->client_lock);
-
- list_for_each_entry_safe(clp, next, &reaplist, cl_lru)
- expire_client(clp);
-
- return count;
-}
-
-static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
- const char *type)
-{
- char buf[INET6_ADDRSTRLEN];
- rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
- printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
-}
-
-static void
-nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
- struct list_head *collect)
-{
- struct nfs4_client *clp = lst->st_stid.sc_client;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!collect)
- return;
-
- lockdep_assert_held(&nn->client_lock);
- atomic_inc(&clp->cl_rpc_users);
- list_add(&lst->st_locks, collect);
-}
-
-static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
- struct list_head *collect,
- bool (*func)(struct nfs4_ol_stateid *))
-{
- struct nfs4_openowner *oop;
- struct nfs4_ol_stateid *stp, *st_next;
- struct nfs4_ol_stateid *lst, *lst_next;
- u64 count = 0;
-
- spin_lock(&clp->cl_lock);
- list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
- list_for_each_entry_safe(stp, st_next,
- &oop->oo_owner.so_stateids, st_perstateowner) {
- list_for_each_entry_safe(lst, lst_next,
- &stp->st_locks, st_locks) {
- if (func) {
- if (func(lst))
- nfsd_inject_add_lock_to_list(lst,
- collect);
- }
- ++count;
- /*
- * Despite the fact that these functions deal
- * with 64-bit integers for "count", we must
- * ensure that it doesn't blow up the
- * clp->cl_rpc_users. Throw a warning if we
- * start to approach INT_MAX here.
- */
- WARN_ON_ONCE(count == (INT_MAX / 2));
- if (count == max)
- goto out;
- }
- }
- }
-out:
- spin_unlock(&clp->cl_lock);
-
- return count;
-}
-
-static u64
-nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect,
- u64 max)
-{
- return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid);
-}
-
-static u64
-nfsd_print_client_locks(struct nfs4_client *clp)
-{
- u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL);
- nfsd_print_count(clp, count, "locked files");
- return count;
-}
-
-u64
-nfsd_inject_print_locks(void)
-{
- struct nfs4_client *clp;
- u64 count = 0;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!nfsd_netns_ready(nn))
- return 0;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru)
- count += nfsd_print_client_locks(clp);
- spin_unlock(&nn->client_lock);
-
- return count;
-}
-
-static void
-nfsd_reap_locks(struct list_head *reaplist)
-{
- struct nfs4_client *clp;
- struct nfs4_ol_stateid *stp, *next;
-
- list_for_each_entry_safe(stp, next, reaplist, st_locks) {
- list_del_init(&stp->st_locks);
- clp = stp->st_stid.sc_client;
- nfs4_put_stid(&stp->st_stid);
- put_client(clp);
- }
-}
-
-u64
-nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size)
-{
- unsigned int count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- clp = nfsd_find_client(addr, addr_size);
- if (clp)
- count = nfsd_collect_client_locks(clp, &reaplist, 0);
- spin_unlock(&nn->client_lock);
- nfsd_reap_locks(&reaplist);
- return count;
-}
-
-u64
-nfsd_inject_forget_locks(u64 max)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- count += nfsd_collect_client_locks(clp, &reaplist, max - count);
- if (max != 0 && count >= max)
- break;
- }
- spin_unlock(&nn->client_lock);
- nfsd_reap_locks(&reaplist);
- return count;
-}
-
-static u64
-nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max,
- struct list_head *collect,
- void (*func)(struct nfs4_openowner *))
-{
- struct nfs4_openowner *oop, *next;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- u64 count = 0;
-
- lockdep_assert_held(&nn->client_lock);
-
- spin_lock(&clp->cl_lock);
- list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
- if (func) {
- func(oop);
- if (collect) {
- atomic_inc(&clp->cl_rpc_users);
- list_add(&oop->oo_perclient, collect);
- }
- }
- ++count;
- /*
- * Despite the fact that these functions deal with
- * 64-bit integers for "count", we must ensure that
- * it doesn't blow up the clp->cl_rpc_users. Throw a
- * warning if we start to approach INT_MAX here.
- */
- WARN_ON_ONCE(count == (INT_MAX / 2));
- if (count == max)
- break;
- }
- spin_unlock(&clp->cl_lock);
-
- return count;
-}
-
-static u64
-nfsd_print_client_openowners(struct nfs4_client *clp)
-{
- u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL);
-
- nfsd_print_count(clp, count, "openowners");
- return count;
-}
-
-static u64
-nfsd_collect_client_openowners(struct nfs4_client *clp,
- struct list_head *collect, u64 max)
-{
- return nfsd_foreach_client_openowner(clp, max, collect,
- unhash_openowner_locked);
-}
-
-u64
-nfsd_inject_print_openowners(void)
-{
- struct nfs4_client *clp;
- u64 count = 0;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!nfsd_netns_ready(nn))
- return 0;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru)
- count += nfsd_print_client_openowners(clp);
- spin_unlock(&nn->client_lock);
-
- return count;
-}
-
-static void
-nfsd_reap_openowners(struct list_head *reaplist)
-{
- struct nfs4_client *clp;
- struct nfs4_openowner *oop, *next;
-
- list_for_each_entry_safe(oop, next, reaplist, oo_perclient) {
- list_del_init(&oop->oo_perclient);
- clp = oop->oo_owner.so_client;
- release_openowner(oop);
- put_client(clp);
- }
-}
-
-u64
-nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr,
- size_t addr_size)
-{
- unsigned int count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- clp = nfsd_find_client(addr, addr_size);
- if (clp)
- count = nfsd_collect_client_openowners(clp, &reaplist, 0);
- spin_unlock(&nn->client_lock);
- nfsd_reap_openowners(&reaplist);
- return count;
-}
-
-u64
-nfsd_inject_forget_openowners(u64 max)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- count += nfsd_collect_client_openowners(clp, &reaplist,
- max - count);
- if (max != 0 && count >= max)
- break;
- }
- spin_unlock(&nn->client_lock);
- nfsd_reap_openowners(&reaplist);
- return count;
-}
-
-static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
- struct list_head *victims)
-{
- struct nfs4_delegation *dp, *next;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- u64 count = 0;
-
- lockdep_assert_held(&nn->client_lock);
-
- spin_lock(&state_lock);
- list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
- if (victims) {
- /*
- * It's not safe to mess with delegations that have a
- * non-zero dl_time. They might have already been broken
- * and could be processed by the laundromat outside of
- * the state_lock. Just leave them be.
- */
- if (dp->dl_time != 0)
- continue;
-
- atomic_inc(&clp->cl_rpc_users);
- WARN_ON(!unhash_delegation_locked(dp));
- list_add(&dp->dl_recall_lru, victims);
- }
- ++count;
- /*
- * Despite the fact that these functions deal with
- * 64-bit integers for "count", we must ensure that
- * it doesn't blow up the clp->cl_rpc_users. Throw a
- * warning if we start to approach INT_MAX here.
- */
- WARN_ON_ONCE(count == (INT_MAX / 2));
- if (count == max)
- break;
- }
- spin_unlock(&state_lock);
- return count;
-}
-
-static u64
-nfsd_print_client_delegations(struct nfs4_client *clp)
-{
- u64 count = nfsd_find_all_delegations(clp, 0, NULL);
-
- nfsd_print_count(clp, count, "delegations");
- return count;
-}
-
-u64
-nfsd_inject_print_delegations(void)
-{
- struct nfs4_client *clp;
- u64 count = 0;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
-
- if (!nfsd_netns_ready(nn))
- return 0;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru)
- count += nfsd_print_client_delegations(clp);
- spin_unlock(&nn->client_lock);
-
- return count;
-}
-
-static void
-nfsd_forget_delegations(struct list_head *reaplist)
-{
- struct nfs4_client *clp;
- struct nfs4_delegation *dp, *next;
-
- list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
- list_del_init(&dp->dl_recall_lru);
- clp = dp->dl_stid.sc_client;
- revoke_delegation(dp);
- put_client(clp);
- }
-}
-
-u64
-nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr,
- size_t addr_size)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- clp = nfsd_find_client(addr, addr_size);
- if (clp)
- count = nfsd_find_all_delegations(clp, 0, &reaplist);
- spin_unlock(&nn->client_lock);
-
- nfsd_forget_delegations(&reaplist);
- return count;
-}
-
-u64
-nfsd_inject_forget_delegations(u64 max)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- count += nfsd_find_all_delegations(clp, max - count, &reaplist);
- if (max != 0 && count >= max)
- break;
- }
- spin_unlock(&nn->client_lock);
- nfsd_forget_delegations(&reaplist);
- return count;
-}
-
-static void
-nfsd_recall_delegations(struct list_head *reaplist)
-{
- struct nfs4_client *clp;
- struct nfs4_delegation *dp, *next;
-
- list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
- list_del_init(&dp->dl_recall_lru);
- clp = dp->dl_stid.sc_client;
-
- trace_nfsd_deleg_recall(&dp->dl_stid.sc_stateid);
-
- /*
- * We skipped all entries that had a zero dl_time before,
- * so we can now reset the dl_time back to 0. If a delegation
- * break comes in now, then it won't make any difference since
- * we're recalling it either way.
- */
- spin_lock(&state_lock);
- dp->dl_time = 0;
- spin_unlock(&state_lock);
- nfsd_break_one_deleg(dp);
- put_client(clp);
- }
-}
-
-u64
-nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr,
- size_t addr_size)
-{
- u64 count = 0;
- struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- clp = nfsd_find_client(addr, addr_size);
- if (clp)
- count = nfsd_find_all_delegations(clp, 0, &reaplist);
- spin_unlock(&nn->client_lock);
-
- nfsd_recall_delegations(&reaplist);
- return count;
-}
-
-u64
-nfsd_inject_recall_delegations(u64 max)
-{
- u64 count = 0;
- struct nfs4_client *clp, *next;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
- nfsd_net_id);
- LIST_HEAD(reaplist);
-
- if (!nfsd_netns_ready(nn))
- return count;
-
- spin_lock(&nn->client_lock);
- list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
- count += nfsd_find_all_delegations(clp, max - count, &reaplist);
- if (max != 0 && ++count >= max)
- break;
- }
- spin_unlock(&nn->client_lock);
- nfsd_recall_delegations(&reaplist);
- return count;
-}
-#endif /* CONFIG_NFSD_FAULT_INJECTION */
-
/*
* Since the lifetime of a delegation isn't limited to that of an open, a
* client may quite reasonably hang on to a delegation as long as it has
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 259d5ad0e3f4..833a2c64dfe8 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1855,7 +1855,7 @@ static __be32
nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
struct nfsd4_copy_notify *cn)
{
- int status;
+ __be32 status;
status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid);
if (status)
@@ -2173,7 +2173,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
[OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
@@ -2261,7 +2261,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
*/
cachethis |= nfsd4_cache_this_op(op);
- if (op->opnum == OP_READ) {
+ if (op->opnum == OP_READ || op->opnum == OP_READ_PLUS) {
readcount++;
readbytes += nfsd4_max_reply(argp->rqstp, op);
} else
@@ -3814,36 +3814,14 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
{
struct xdr_stream *xdr = &resp->xdr;
u32 eof;
- int v;
int starting_len = xdr->buf->len - 8;
- long len;
- int thislen;
__be32 nfserr;
__be32 tmp;
- __be32 *p;
int pad;
- /*
- * svcrdma requires every READ payload to start somewhere
- * in xdr->pages.
- */
- if (xdr->iov == xdr->buf->head) {
- xdr->iov = NULL;
- xdr->end = xdr->p;
- }
-
- len = maxcount;
- v = 0;
- while (len) {
- thislen = min_t(long, len, PAGE_SIZE);
- p = xdr_reserve_space(xdr, thislen);
- WARN_ON_ONCE(!p);
- resp->rqstp->rq_vec[v].iov_base = p;
- resp->rqstp->rq_vec[v].iov_len = thislen;
- v++;
- len -= thislen;
- }
- read->rd_vlen = v;
+ read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
+ if (read->rd_vlen < 0)
+ return nfserr_resource;
nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
@@ -4619,6 +4597,149 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr_resource;
p = xdr_encode_hyper(p, os->count);
*p++ = cpu_to_be32(0);
+ return nfserr;
+}
+
+static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ unsigned long *maxcount, u32 *eof,
+ loff_t *pos)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_nf->nf_file;
+ int starting_len = xdr->buf->len;
+ loff_t hole_pos;
+ __be32 nfserr;
+ __be32 *p, tmp;
+ __be64 tmp64;
+
+ hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos > read->rd_offset)
+ *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
+ *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+
+ /* Content type, offset, byte count */
+ p = xdr_reserve_space(xdr, 4 + 8 + 4);
+ if (!p)
+ return nfserr_resource;
+
+ read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
+ if (read->rd_vlen < 0)
+ return nfserr_resource;
+
+ nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+ resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+ if (nfserr)
+ return nfserr;
+
+ tmp = htonl(NFS4_CONTENT_DATA);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+ tmp64 = cpu_to_be64(read->rd_offset);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
+ tmp = htonl(*maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ unsigned long *maxcount, u32 *eof)
+{
+ struct file *file = read->rd_nf->nf_file;
+ loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
+ loff_t f_size = i_size_read(file_inode(file));
+ unsigned long count;
+ __be32 *p;
+
+ if (data_pos == -ENXIO)
+ data_pos = f_size;
+ else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
+ return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
+ count = data_pos - read->rd_offset;
+
+ /* Content type, offset, byte count */
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = htonl(NFS4_CONTENT_HOLE);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, count);
+
+ *eof = (read->rd_offset + count) >= f_size;
+ *maxcount = min_t(unsigned long, count, *maxcount);
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ unsigned long maxcount, count;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file;
+ int starting_len = xdr->buf->len;
+ int last_segment = xdr->buf->len;
+ int segments = 0;
+ __be32 *p, tmp;
+ bool is_data;
+ loff_t pos;
+ u32 eof;
+
+ if (nfserr)
+ return nfserr;
+ file = read->rd_nf->nf_file;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4);
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount,
+ (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+ count = maxcount;
+
+ eof = read->rd_offset >= i_size_read(file_inode(file));
+ if (eof)
+ goto out;
+
+ pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ is_data = pos > read->rd_offset;
+
+ while (count > 0 && !eof) {
+ maxcount = count;
+ if (is_data)
+ nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
+ segments == 0 ? &pos : NULL);
+ else
+ nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
+ if (nfserr)
+ goto out;
+ count -= maxcount;
+ read->rd_offset += maxcount;
+ is_data = !is_data;
+ last_segment = xdr->buf->len;
+ segments++;
+ }
+
+out:
+ if (nfserr && segments == 0)
+ xdr_truncate_encode(xdr, starting_len);
+ else {
+ tmp = htonl(eof);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+ tmp = htonl(segments);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ if (nfserr) {
+ xdr_truncate_encode(xdr, last_segment);
+ nfserr = nfs_ok;
+ }
+ }
return nfserr;
}
@@ -4679,7 +4800,7 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
/*
* Encode kmalloc-ed buffer in to XDR stream.
*/
-static int
+static __be32
nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen)
{
u32 cplen;
@@ -4795,7 +4916,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
u32 xdrlen, offset;
u64 cookie;
char *sp;
- __be32 status;
+ __be32 status, tmp;
__be32 *p;
u32 nuser;
@@ -4828,7 +4949,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
slen = strlen(sp);
/*
- * Check if this a user. attribute, skip it if not.
+ * Check if this is a "user." attribute, skip it if not.
*/
if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
goto contloop;
@@ -4859,7 +4980,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
goto out;
}
- p = xdr_encode_opaque(p, sp, slen);
+ xdr_encode_opaque(p, sp, slen);
xdrleft -= xdrlen;
count++;
@@ -4888,8 +5009,8 @@ wreof:
cookie = offset + count;
write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8);
- count = htonl(count);
- write_bytes_to_xdr_buf(xdr->buf, count_offset, &count, 4);
+ tmp = cpu_to_be32(count);
+ write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
out:
if (listxattrs->lsxa_len)
kvfree(listxattrs->lsxa_buf);
@@ -4996,7 +5117,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
[OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
@@ -5157,6 +5278,12 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
}
int
+nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
+int
nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd4_compoundargs *args = rqstp->rq_argp;
@@ -5183,15 +5310,14 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
int
nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
{
- /*
- * All that remains is to write the tag and operation count...
- */
struct nfsd4_compoundres *resp = rqstp->rq_resp;
struct xdr_buf *buf = resp->xdr.buf;
WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
buf->tail[0].iov_len);
+ *p = resp->cstate.status;
+
rqstp->rq_next_page = resp->xdr.page_ptr + 1;
p = resp->tagp;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 0a0cf1fd77d3..80c90fc231a5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -172,14 +172,10 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
if (status)
goto out_nomem;
- nn->drc_hashtbl = kcalloc(hashsize,
- sizeof(*nn->drc_hashtbl), GFP_KERNEL);
- if (!nn->drc_hashtbl) {
- nn->drc_hashtbl = vzalloc(array_size(hashsize,
- sizeof(*nn->drc_hashtbl)));
- if (!nn->drc_hashtbl)
- goto out_shrinker;
- }
+ nn->drc_hashtbl = kvzalloc(array_size(hashsize,
+ sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
+ if (!nn->drc_hashtbl)
+ goto out_shrinker;
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ae236113040..f6d5d783f4a4 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1534,7 +1534,6 @@ static int __init init_nfsd(void)
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
- nfsd_fault_inject_init(); /* nfsd fault injection controls */
nfsd_stat_init(); /* Statistics */
retval = nfsd_drc_slab_create();
if (retval)
@@ -1555,7 +1554,6 @@ out_free_lockd:
nfsd_drc_slab_free();
out_free_stat:
nfsd_stat_shutdown();
- nfsd_fault_inject_cleanup();
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
@@ -1575,7 +1573,6 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
- nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6e0b066480c5..0d71549f9d42 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -11,30 +11,14 @@
#include "xdr.h"
#include "vfs.h"
-typedef struct svc_rqst svc_rqst;
-typedef struct svc_buf svc_buf;
-
#define NFSDDBG_FACILITY NFSDDBG_PROC
-
static __be32
nfsd_proc_null(struct svc_rqst *rqstp)
{
- return nfs_ok;
+ return rpc_success;
}
-static __be32
-nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
-{
- if (err) return err;
- return fh_getattr(&resp->fh, &resp->stat);
-}
-static __be32
-nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
-{
- if (err) return err;
- return fh_getattr(&resp->fh, &resp->stat);
-}
/*
* Get a file's attributes
* N.B. After this call resp->fh needs an fh_put
@@ -44,13 +28,17 @@ nfsd_proc_getattr(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
+
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0,
- NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
- return nfsd_return_attrs(nfserr, resp);
+ resp->status = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+ if (resp->status != nfs_ok)
+ goto out;
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
@@ -64,7 +52,6 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
struct nfsd_attrstat *resp = rqstp->rq_resp;
struct iattr *iap = &argp->attrs;
struct svc_fh *fhp;
- __be32 nfserr;
dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
SVCFH_fmt(&argp->fh),
@@ -96,9 +83,9 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
*/
time64_t delta = iap->ia_atime.tv_sec - ktime_get_real_seconds();
- nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
- if (nfserr)
- goto done;
+ resp->status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
if (delta < 0)
delta = -delta;
@@ -113,9 +100,20 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
}
}
- nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
-done:
- return nfsd_return_attrs(nfserr, resp);
+ resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
+}
+
+/* Obsolete, replaced by MNTPROC_MNT. */
+static __be32
+nfsd_proc_root(struct svc_rqst *rqstp)
+{
+ return rpc_success;
}
/*
@@ -129,17 +127,20 @@ nfsd_proc_lookup(struct svc_rqst *rqstp)
{
struct nfsd_diropargs *argp = rqstp->rq_argp;
struct nfsd_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: LOOKUP %s %.*s\n",
SVCFH_fmt(&argp->fh), argp->len, argp->name);
fh_init(&resp->fh, NFS_FHSIZE);
- nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
- &resp->fh);
-
+ resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
+ &resp->fh);
fh_put(&argp->fh);
- return nfsd_return_dirop(nfserr, resp);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
@@ -150,16 +151,15 @@ nfsd_proc_readlink(struct svc_rqst *rqstp)
{
struct nfsd_readlinkargs *argp = rqstp->rq_argp;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
resp->len = NFS_MAXPATHLEN;
- nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
+ resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
fh_put(&argp->fh);
- return nfserr;
+ return rpc_success;
}
/*
@@ -171,7 +171,6 @@ nfsd_proc_read(struct svc_rqst *rqstp)
{
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
- __be32 nfserr;
u32 eof;
dprintk("nfsd: READ %s %d bytes at %d\n",
@@ -193,14 +192,23 @@ nfsd_proc_read(struct svc_rqst *rqstp)
svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
resp->count = argp->count;
- nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &resp->count,
- &eof);
-
- if (nfserr) return nfserr;
- return fh_getattr(&resp->fh, &resp->stat);
+ resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
+ argp->offset,
+ rqstp->rq_vec, argp->vlen,
+ &resp->count,
+ &eof);
+ if (resp->status == nfs_ok)
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+ else if (resp->status == nfserr_jukebox)
+ return rpc_drop_reply;
+ return rpc_success;
+}
+
+/* Reserved */
+static __be32
+nfsd_proc_writecache(struct svc_rqst *rqstp)
+{
+ return rpc_success;
}
/*
@@ -212,7 +220,6 @@ nfsd_proc_write(struct svc_rqst *rqstp)
{
struct nfsd_writeargs *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- __be32 nfserr;
unsigned long cnt = argp->len;
unsigned int nvecs;
@@ -222,12 +229,20 @@ nfsd_proc_write(struct svc_rqst *rqstp)
nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
&argp->first, cnt);
- if (!nvecs)
- return nfserr_io;
- nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset, rqstp->rq_vec, nvecs,
- &cnt, NFS_DATA_SYNC, NULL);
- return nfsd_return_attrs(nfserr, resp);
+ if (!nvecs) {
+ resp->status = nfserr_io;
+ goto out;
+ }
+
+ resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
+ argp->offset, rqstp->rq_vec, nvecs,
+ &cnt, NFS_DATA_SYNC, NULL);
+ if (resp->status == nfs_ok)
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+ else if (resp->status == nfserr_jukebox)
+ return rpc_drop_reply;
+out:
+ return rpc_success;
}
/*
@@ -247,7 +262,6 @@ nfsd_proc_create(struct svc_rqst *rqstp)
struct inode *inode;
struct dentry *dchild;
int type, mode;
- __be32 nfserr;
int hosterr;
dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
@@ -255,40 +269,40 @@ nfsd_proc_create(struct svc_rqst *rqstp)
SVCFH_fmt(dirfhp), argp->len, argp->name);
/* First verify the parent file handle */
- nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
- if (nfserr)
+ resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (resp->status != nfs_ok)
goto done; /* must fh_put dirfhp even on error */
/* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
- nfserr = nfserr_exist;
+ resp->status = nfserr_exist;
if (isdotent(argp->name, argp->len))
goto done;
hosterr = fh_want_write(dirfhp);
if (hosterr) {
- nfserr = nfserrno(hosterr);
+ resp->status = nfserrno(hosterr);
goto done;
}
fh_lock_nested(dirfhp, I_MUTEX_PARENT);
dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
if (IS_ERR(dchild)) {
- nfserr = nfserrno(PTR_ERR(dchild));
+ resp->status = nfserrno(PTR_ERR(dchild));
goto out_unlock;
}
fh_init(newfhp, NFS_FHSIZE);
- nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
- if (!nfserr && d_really_is_negative(dchild))
- nfserr = nfserr_noent;
+ resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
+ if (!resp->status && d_really_is_negative(dchild))
+ resp->status = nfserr_noent;
dput(dchild);
- if (nfserr) {
- if (nfserr != nfserr_noent)
+ if (resp->status) {
+ if (resp->status != nfserr_noent)
goto out_unlock;
/*
* If the new file handle wasn't verified, we can't tell
* whether the file exists or not. Time to bail ...
*/
- nfserr = nfserr_acces;
+ resp->status = nfserr_acces;
if (!newfhp->fh_dentry) {
printk(KERN_WARNING
"nfsd_proc_create: file handle not verified\n");
@@ -321,11 +335,11 @@ nfsd_proc_create(struct svc_rqst *rqstp)
* echo thing > device-special-file-or-pipe
* by doing a CREATE with type==0
*/
- nfserr = nfsd_permission(rqstp,
+ resp->status = nfsd_permission(rqstp,
newfhp->fh_export,
newfhp->fh_dentry,
NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
- if (nfserr && nfserr != nfserr_rofs)
+ if (resp->status && resp->status != nfserr_rofs)
goto out_unlock;
}
} else
@@ -361,16 +375,17 @@ nfsd_proc_create(struct svc_rqst *rqstp)
attr->ia_valid &= ~ATTR_SIZE;
/* Make sure the type and device matches */
- nfserr = nfserr_exist;
+ resp->status = nfserr_exist;
if (inode && type != (inode->i_mode & S_IFMT))
goto out_unlock;
}
- nfserr = 0;
+ resp->status = nfs_ok;
if (!inode) {
/* File doesn't exist. Create it and set attrs */
- nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name,
- argp->len, attr, type, rdev, newfhp);
+ resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
+ argp->len, attr, type, rdev,
+ newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
@@ -380,7 +395,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
attr->ia_valid &= ATTR_SIZE;
if (attr->ia_valid)
- nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time64_t)0);
+ resp->status = nfsd_setattr(rqstp, newfhp, attr, 0,
+ (time64_t)0);
}
out_unlock:
@@ -389,47 +405,52 @@ out_unlock:
fh_drop_write(dirfhp);
done:
fh_put(dirfhp);
- return nfsd_return_dirop(nfserr, resp);
+ if (resp->status != nfs_ok)
+ goto out;
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
static __be32
nfsd_proc_remove(struct svc_rqst *rqstp)
{
struct nfsd_diropargs *argp = rqstp->rq_argp;
- __be32 nfserr;
+ struct nfsd_stat *resp = rqstp->rq_resp;
dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
argp->len, argp->name);
/* Unlink. -SIFDIR means file must not be a directory */
- nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len);
+ resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR,
+ argp->name, argp->len);
fh_put(&argp->fh);
- return nfserr;
+ return rpc_success;
}
static __be32
nfsd_proc_rename(struct svc_rqst *rqstp)
{
struct nfsd_renameargs *argp = rqstp->rq_argp;
- __be32 nfserr;
+ struct nfsd_stat *resp = rqstp->rq_resp;
dprintk("nfsd: RENAME %s %.*s -> \n",
SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
dprintk("nfsd: -> %s %.*s\n",
SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
- nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
- &argp->tfh, argp->tname, argp->tlen);
+ resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
+ &argp->tfh, argp->tname, argp->tlen);
fh_put(&argp->ffh);
fh_put(&argp->tfh);
- return nfserr;
+ return rpc_success;
}
static __be32
nfsd_proc_link(struct svc_rqst *rqstp)
{
struct nfsd_linkargs *argp = rqstp->rq_argp;
- __be32 nfserr;
+ struct nfsd_stat *resp = rqstp->rq_resp;
dprintk("nfsd: LINK %s ->\n",
SVCFH_fmt(&argp->ffh));
@@ -438,41 +459,46 @@ nfsd_proc_link(struct svc_rqst *rqstp)
argp->tlen,
argp->tname);
- nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
- &argp->ffh);
+ resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
+ &argp->ffh);
fh_put(&argp->ffh);
fh_put(&argp->tfh);
- return nfserr;
+ return rpc_success;
}
static __be32
nfsd_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd_symlinkargs *argp = rqstp->rq_argp;
+ struct nfsd_stat *resp = rqstp->rq_resp;
struct svc_fh newfh;
- __be32 nfserr;
- if (argp->tlen > NFS_MAXPATHLEN)
- return nfserr_nametoolong;
+ if (argp->tlen > NFS_MAXPATHLEN) {
+ resp->status = nfserr_nametoolong;
+ goto out;
+ }
argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
page_address(rqstp->rq_arg.pages[0]),
argp->tlen);
- if (IS_ERR(argp->tname))
- return nfserrno(PTR_ERR(argp->tname));
+ if (IS_ERR(argp->tname)) {
+ resp->status = nfserrno(PTR_ERR(argp->tname));
+ goto out;
+ }
dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
argp->tlen, argp->tname);
fh_init(&newfh, NFS_FHSIZE);
- nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
- argp->tname, &newfh);
+ resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
+ argp->tname, &newfh);
kfree(argp->tname);
fh_put(&argp->ffh);
fh_put(&newfh);
- return nfserr;
+out:
+ return rpc_success;
}
/*
@@ -484,7 +510,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd_createargs *argp = rqstp->rq_argp;
struct nfsd_diropres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -495,10 +520,15 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_init(&resp->fh, NFS_FHSIZE);
- nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
+ resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
+ &argp->attrs, S_IFDIR, 0, &resp->fh);
fh_put(&argp->fh);
- return nfsd_return_dirop(nfserr, resp);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
}
/*
@@ -508,13 +538,14 @@ static __be32
nfsd_proc_rmdir(struct svc_rqst *rqstp)
{
struct nfsd_diropargs *argp = rqstp->rq_argp;
- __be32 nfserr;
+ struct nfsd_stat *resp = rqstp->rq_resp;
dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
- nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len);
+ resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR,
+ argp->name, argp->len);
fh_put(&argp->fh);
- return nfserr;
+ return rpc_success;
}
/*
@@ -526,7 +557,6 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
struct nfsd_readdirargs *argp = rqstp->rq_argp;
struct nfsd_readdirres *resp = rqstp->rq_resp;
int count;
- __be32 nfserr;
loff_t offset;
dprintk("nfsd: READDIR %s %d bytes at %d\n",
@@ -547,15 +577,15 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
resp->common.err = nfs_ok;
/* Read directory and encode entries on the fly */
offset = argp->cookie;
- nfserr = nfsd_readdir(rqstp, &argp->fh, &offset,
- &resp->common, nfssvc_encode_entry);
+ resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
+ &resp->common, nfssvc_encode_entry);
resp->count = resp->buffer - argp->buffer;
if (resp->offset)
*resp->offset = htonl(offset);
fh_put(&argp->fh);
- return nfserr;
+ return rpc_success;
}
/*
@@ -566,14 +596,13 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_statfsres *resp = rqstp->rq_resp;
- __be32 nfserr;
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
- NFSD_MAY_BYPASS_GSS_ON_ROOT);
+ resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+ NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
- return nfserr;
+ return rpc_success;
}
/*
@@ -594,13 +623,13 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_argsize = sizeof(struct nfsd_void),
.pc_ressize = sizeof(struct nfsd_void),
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = ST,
+ .pc_xdrressize = 0,
},
[NFSPROC_GETATTR] = {
.pc_func = nfsd_proc_getattr,
.pc_decode = nfssvc_decode_fhandle,
.pc_encode = nfssvc_encode_attrstat,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
@@ -610,25 +639,26 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_func = nfsd_proc_setattr,
.pc_decode = nfssvc_decode_sattrargs,
.pc_encode = nfssvc_encode_attrstat,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_sattrargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
},
[NFSPROC_ROOT] = {
+ .pc_func = nfsd_proc_root,
.pc_decode = nfssvc_decode_void,
.pc_encode = nfssvc_encode_void,
.pc_argsize = sizeof(struct nfsd_void),
.pc_ressize = sizeof(struct nfsd_void),
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = ST,
+ .pc_xdrressize = 0,
},
[NFSPROC_LOOKUP] = {
.pc_func = nfsd_proc_lookup,
.pc_decode = nfssvc_decode_diropargs,
.pc_encode = nfssvc_encode_diropres,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_NOCACHE,
@@ -647,25 +677,26 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_func = nfsd_proc_read,
.pc_decode = nfssvc_decode_readargs,
.pc_encode = nfssvc_encode_readres,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_readres,
.pc_argsize = sizeof(struct nfsd_readargs),
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
},
[NFSPROC_WRITECACHE] = {
+ .pc_func = nfsd_proc_writecache,
.pc_decode = nfssvc_decode_void,
.pc_encode = nfssvc_encode_void,
.pc_argsize = sizeof(struct nfsd_void),
.pc_ressize = sizeof(struct nfsd_void),
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = ST,
+ .pc_xdrressize = 0,
},
[NFSPROC_WRITE] = {
.pc_func = nfsd_proc_write,
.pc_decode = nfssvc_decode_writeargs,
.pc_encode = nfssvc_encode_attrstat,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_writeargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
@@ -675,7 +706,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_func = nfsd_proc_create,
.pc_decode = nfssvc_decode_createargs,
.pc_encode = nfssvc_encode_diropres,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
@@ -684,36 +715,36 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_REMOVE] = {
.pc_func = nfsd_proc_remove,
.pc_decode = nfssvc_decode_diropargs,
- .pc_encode = nfssvc_encode_void,
+ .pc_encode = nfssvc_encode_stat,
.pc_argsize = sizeof(struct nfsd_diropargs),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
},
[NFSPROC_RENAME] = {
.pc_func = nfsd_proc_rename,
.pc_decode = nfssvc_decode_renameargs,
- .pc_encode = nfssvc_encode_void,
+ .pc_encode = nfssvc_encode_stat,
.pc_argsize = sizeof(struct nfsd_renameargs),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
},
[NFSPROC_LINK] = {
.pc_func = nfsd_proc_link,
.pc_decode = nfssvc_decode_linkargs,
- .pc_encode = nfssvc_encode_void,
+ .pc_encode = nfssvc_encode_stat,
.pc_argsize = sizeof(struct nfsd_linkargs),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
},
[NFSPROC_SYMLINK] = {
.pc_func = nfsd_proc_symlink,
.pc_decode = nfssvc_decode_symlinkargs,
- .pc_encode = nfssvc_encode_void,
+ .pc_encode = nfssvc_encode_stat,
.pc_argsize = sizeof(struct nfsd_symlinkargs),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
},
@@ -721,7 +752,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_func = nfsd_proc_mkdir,
.pc_decode = nfssvc_decode_createargs,
.pc_encode = nfssvc_encode_diropres,
- .pc_release = nfssvc_release_fhandle,
+ .pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
@@ -730,9 +761,9 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_RMDIR] = {
.pc_func = nfsd_proc_rmdir,
.pc_decode = nfssvc_decode_diropargs,
- .pc_encode = nfssvc_encode_void,
+ .pc_encode = nfssvc_encode_stat,
.pc_argsize = sizeof(struct nfsd_diropargs),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
},
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index f7f6473578af..27b1ad136150 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -960,15 +960,6 @@ out:
return 0;
}
-static __be32 map_new_errors(u32 vers, __be32 nfserr)
-{
- if (nfserr == nfserr_jukebox && vers == 2)
- return nfserr_dropit;
- if (nfserr == nfserr_wrongsec && vers < 4)
- return nfserr_acces;
- return nfserr;
-}
-
/*
* A write procedure can have a large argument, and a read procedure can
* have a large reply, but no NFSv2 or NFSv3 procedure has argument and
@@ -1000,80 +991,85 @@ static bool nfs_request_too_big(struct svc_rqst *rqstp,
return rqstp->rq_arg.len > PAGE_SIZE;
}
-int
-nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+/**
+ * nfsd_dispatch - Process an NFS or NFSACL Request
+ * @rqstp: incoming request
+ * @statp: pointer to location of accept_stat field in RPC Reply buffer
+ *
+ * This RPC dispatcher integrates the NFS server's duplicate reply cache.
+ *
+ * Return values:
+ * %0: Processing complete; do not send a Reply
+ * %1: Processing complete; send Reply in rqstp->rq_res
+ */
+int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
- const struct svc_procedure *proc;
- __be32 nfserr;
- __be32 *nfserrp;
+ const struct svc_procedure *proc = rqstp->rq_procinfo;
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ __be32 *p;
dprintk("nfsd_dispatch: vers %d proc %d\n",
rqstp->rq_vers, rqstp->rq_proc);
- proc = rqstp->rq_procinfo;
- if (nfs_request_too_big(rqstp, proc)) {
- dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
- *statp = rpc_garbage_args;
- return 1;
- }
- rqstp->rq_lease_breaker = NULL;
+ if (nfs_request_too_big(rqstp, proc))
+ goto out_too_large;
+
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
- /* Decode arguments */
- if (proc->pc_decode &&
- !proc->pc_decode(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base)) {
- dprintk("nfsd: failed to decode arguments!\n");
- *statp = rpc_garbage_args;
- return 1;
- }
+ if (!proc->pc_decode(rqstp, argv->iov_base))
+ goto out_decode_err;
- /* Check whether we have this call in the cache. */
switch (nfsd_cache_lookup(rqstp)) {
- case RC_DROPIT:
- return 0;
+ case RC_DOIT:
+ break;
case RC_REPLY:
- return 1;
- case RC_DOIT:;
- /* do it */
+ goto out_cached_reply;
+ case RC_DROPIT:
+ goto out_dropit;
}
- /* need to grab the location to store the status, as
- * nfsv4 does some encoding while processing
+ /*
+ * Need to grab the location to store the status, as
+ * NFSv4 does some encoding while processing
*/
- nfserrp = rqstp->rq_res.head[0].iov_base
- + rqstp->rq_res.head[0].iov_len;
- rqstp->rq_res.head[0].iov_len += sizeof(__be32);
-
- /* Now call the procedure handler, and encode NFS status. */
- nfserr = proc->pc_func(rqstp);
- nfserr = map_new_errors(rqstp->rq_vers, nfserr);
- if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) {
- dprintk("nfsd: Dropping request; may be revisited later\n");
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
- return 0;
- }
+ p = resv->iov_base + resv->iov_len;
+ resv->iov_len += sizeof(__be32);
- if (rqstp->rq_proc != 0)
- *nfserrp++ = nfserr;
+ *statp = proc->pc_func(rqstp);
+ if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
+ goto out_update_drop;
- /* Encode result.
- * For NFSv2, additional info is never returned in case of an error.
- */
- if (!(nfserr && rqstp->rq_vers == 2)) {
- if (proc->pc_encode && !proc->pc_encode(rqstp, nfserrp)) {
- /* Failed to encode result. Release cache entry */
- dprintk("nfsd: failed to encode result!\n");
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
- *statp = rpc_system_err;
- return 1;
- }
- }
+ if (!proc->pc_encode(rqstp, p))
+ goto out_encode_err;
- /* Store reply in cache. */
nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+out_cached_reply:
+ return 1;
+
+out_too_large:
+ dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+ *statp = rpc_garbage_args;
+ return 1;
+
+out_decode_err:
+ dprintk("nfsd: failed to decode arguments!\n");
+ *statp = rpc_garbage_args;
+ return 1;
+
+out_update_drop:
+ dprintk("nfsd: Dropping request; may be revisited later\n");
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+out_dropit:
+ return 0;
+
+out_encode_err:
+ dprintk("nfsd: failed to encode result!\n");
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ *statp = rpc_system_err;
return 1;
}
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b51fe515f06f..8a288c8fcd57 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -430,11 +430,24 @@ nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
}
int
+nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd_stat *resp = rqstp->rq_resp;
+
+ *p++ = resp->status;
+ return xdr_ressize_check(rqstp, p);
+}
+
+int
nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_attrstat *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ goto out;
p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+out:
return xdr_ressize_check(rqstp, p);
}
@@ -443,8 +456,12 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_diropres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ goto out;
p = encode_fh(p, &resp->fh);
p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+out:
return xdr_ressize_check(rqstp, p);
}
@@ -453,6 +470,10 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_readlinkres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
*p++ = htonl(resp->len);
xdr_ressize_check(rqstp, p);
rqstp->rq_res.page_len = resp->len;
@@ -470,6 +491,10 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_readres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->count);
xdr_ressize_check(rqstp, p);
@@ -490,6 +515,10 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_readdirres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
xdr_ressize_check(rqstp, p);
p = resp->buffer;
*p++ = 0; /* no more entries */
@@ -505,6 +534,10 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
struct nfsd_statfsres *resp = rqstp->rq_resp;
struct kstatfs *stat = &resp->stats;
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
*p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */
*p++ = htonl(stat->f_bsize);
*p++ = htonl(stat->f_blocks);
@@ -561,10 +594,23 @@ nfssvc_encode_entry(void *ccdv, const char *name,
/*
* XDR release functions
*/
-void
-nfssvc_release_fhandle(struct svc_rqst *rqstp)
+void nfssvc_release_attrstat(struct svc_rqst *rqstp)
{
- struct nfsd_fhandle *resp = rqstp->rq_resp;
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh);
+}
+
+void nfssvc_release_diropres(struct svc_rqst *rqstp)
+{
+ struct nfsd_diropres *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh);
+}
+
+void nfssvc_release_readres(struct svc_rqst *rqstp)
+{
+ struct nfsd_readres *resp = rqstp->rq_resp;
fh_put(&resp->fh);
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3b408532a5dc..9eae11a9d21c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -693,31 +693,4 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
extern void nfsd4_record_grace_done(struct nfsd_net *nn);
-/* nfs fault injection functions */
-#ifdef CONFIG_NFSD_FAULT_INJECTION
-void nfsd_fault_inject_init(void);
-void nfsd_fault_inject_cleanup(void);
-
-u64 nfsd_inject_print_clients(void);
-u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t);
-u64 nfsd_inject_forget_clients(u64);
-
-u64 nfsd_inject_print_locks(void);
-u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t);
-u64 nfsd_inject_forget_locks(u64);
-
-u64 nfsd_inject_print_openowners(void);
-u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t);
-u64 nfsd_inject_forget_openowners(u64);
-
-u64 nfsd_inject_print_delegations(void);
-u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t);
-u64 nfsd_inject_forget_delegations(u64);
-u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
-u64 nfsd_inject_recall_delegations(u64);
-#else /* CONFIG_NFSD_FAULT_INJECTION */
-static inline void nfsd_fault_inject_init(void) {}
-static inline void nfsd_fault_inject_cleanup(void) {}
-#endif /* CONFIG_NFSD_FAULT_INJECTION */
-
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 1861db1bdc67..99bf07800cd0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -289,8 +289,8 @@ DEFINE_STATEID_EVENT(layout_recall_done);
DEFINE_STATEID_EVENT(layout_recall_fail);
DEFINE_STATEID_EVENT(layout_recall_release);
-DEFINE_STATEID_EVENT(deleg_open);
-DEFINE_STATEID_EVENT(deleg_none);
+DEFINE_STATEID_EVENT(open);
+DEFINE_STATEID_EVENT(deleg_read);
DEFINE_STATEID_EVENT(deleg_break);
DEFINE_STATEID_EVENT(deleg_recall);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index aba5af9df328..1ecaceebee13 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2259,7 +2259,8 @@ out:
__be32
nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
{
- int err, ret;
+ __be32 err;
+ int ret;
err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
if (err)
@@ -2283,7 +2284,8 @@ __be32
nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
void *buf, u32 len, u32 flags)
{
- int err, ret;
+ __be32 err;
+ int ret;
err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
if (err)
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index ea7cca3a64b7..0ff336b0b25f 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -82,27 +82,37 @@ struct nfsd_readdirargs {
__be32 * buffer;
};
+struct nfsd_stat {
+ __be32 status;
+};
+
struct nfsd_attrstat {
+ __be32 status;
struct svc_fh fh;
struct kstat stat;
};
struct nfsd_diropres {
+ __be32 status;
struct svc_fh fh;
struct kstat stat;
};
struct nfsd_readlinkres {
+ __be32 status;
int len;
};
struct nfsd_readres {
+ __be32 status;
struct svc_fh fh;
unsigned long count;
struct kstat stat;
};
struct nfsd_readdirres {
+ __be32 status;
+
int count;
struct readdir_cd common;
@@ -112,6 +122,7 @@ struct nfsd_readdirres {
};
struct nfsd_statfsres {
+ __be32 status;
struct kstatfs stats;
};
@@ -146,6 +157,7 @@ int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
int nfssvc_encode_void(struct svc_rqst *, __be32 *);
+int nfssvc_encode_stat(struct svc_rqst *, __be32 *);
int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *);
int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
@@ -156,7 +168,9 @@ int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
int nfssvc_encode_entry(void *, const char *name,
int namlen, loff_t offset, u64 ino, unsigned int);
-void nfssvc_release_fhandle(struct svc_rqst *);
+void nfssvc_release_attrstat(struct svc_rqst *rqstp);
+void nfssvc_release_diropres(struct svc_rqst *rqstp);
+void nfssvc_release_readres(struct svc_rqst *rqstp);
/* Helper functions for NFSv2 ACL code */
__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 4155fd71714c..ae6fa6c9cb46 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -273,6 +273,7 @@ union nfsd3_xdrstore {
#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
+int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *);
int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *);
int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 66499fb6b567..679d40af1bbb 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -781,6 +781,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
+int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *);
int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *);
int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2eee5fb1a882..4abd928b0bc8 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -651,8 +651,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = nmaxinodes;
buf->f_ffree = nfreeinodes;
buf->f_namelen = NILFS_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 7dc3bc604f78..0d7e948cb29c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2643,8 +2643,7 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
* the least significant 32-bits in f_fsid[0] and the most significant
* 32-bits in f_fsid[1].
*/
- sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff;
- sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff;
+ sfs->f_fsid = u64_to_fsid(vol->serial_no);
/* Maximum length of filenames. */
sfs->f_namelen = NTFS_MAX_NAME_LEN;
return 0;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index b425f0b01dce..b9a9d69dde7e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -883,6 +883,10 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
OCFS2_JOURNAL_DIRTY_FL);
journal->j_journal = j_journal;
+ journal->j_journal->j_submit_inode_data_buffers =
+ jbd2_journal_submit_inode_data_buffers;
+ journal->j_journal->j_finish_inode_data_buffers =
+ jbd2_journal_finish_inode_data_buffers;
journal->j_inode = inode;
journal->j_bh = bh;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b76ec6b88ded..ce93ccca8639 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -282,8 +282,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = sbi->s_num_blocks;
buf->f_files = sbi->s_num_blocks;
buf->f_namelen = OMFS_NAMELEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_bfree = buf->f_bavail = buf->f_ffree =
omfs_count_free(s);
@@ -363,12 +362,11 @@ static int omfs_get_imap(struct super_block *sb)
bh = sb_bread(sb, block++);
if (!bh)
goto nomem_free;
- *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL);
+ *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL);
if (!*ptr) {
brelse(bh);
goto nomem_free;
}
- memcpy(*ptr, bh->b_data, sb->s_blocksize);
if (count < sb->s_blocksize)
memset((void *)*ptr + count, 0xff,
sb->s_blocksize - count);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa69c35d904c..0f707003dda5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1268,6 +1268,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
kuid_t kloginuid;
int rv;
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 28d6105e908e..58c075e2a452 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -297,6 +297,21 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
return rv;
}
+static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ if (pde_is_permanent(pde))
+ return pde->proc_ops->proc_read_iter(iocb, iter);
+
+ if (!use_pde(pde))
+ return -EIO;
+ ret = pde->proc_ops->proc_read_iter(iocb, iter);
+ unuse_pde(pde);
+ return ret;
+}
+
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
typeof_member(struct proc_ops, proc_read) read;
@@ -572,9 +587,18 @@ static const struct file_operations proc_reg_file_ops = {
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = proc_reg_compat_ioctl,
-#endif
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -582,12 +606,26 @@ static const struct file_operations proc_reg_file_ops = {
};
#ifdef CONFIG_COMPAT
-static const struct file_operations proc_reg_file_ops_no_compat = {
+static const struct file_operations proc_reg_file_ops_compat = {
.llseek = proc_reg_llseek,
.read = proc_reg_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -619,42 +657,51 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
struct inode *inode = new_inode(sb);
- if (inode) {
- inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- PROC_I(inode)->pde = de;
+ if (!inode) {
+ pde_put(de);
+ return NULL;
+ }
- if (is_empty_pde(de)) {
- make_empty_dir_inode(inode);
- return inode;
- }
- if (de->mode) {
- inode->i_mode = de->mode;
- inode->i_uid = de->uid;
- inode->i_gid = de->gid;
- }
- if (de->size)
- inode->i_size = de->size;
- if (de->nlink)
- set_nlink(inode, de->nlink);
+ inode->i_ino = de->low_ino;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
+ }
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = de->proc_iops;
+ if (de->mode) {
+ inode->i_mode = de->mode;
+ inode->i_uid = de->uid;
+ inode->i_gid = de->gid;
+ }
+ if (de->size)
+ inode->i_size = de->size;
+ if (de->nlink)
+ set_nlink(inode, de->nlink);
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops;
+ else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (!de->proc_ops->proc_compat_ioctl) {
- inode->i_fop = &proc_reg_file_ops_no_compat;
- }
+ if (de->proc_ops->proc_compat_ioctl) {
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops_compat;
+ else
+ inode->i_fop = &proc_reg_file_ops_compat;
+ }
#endif
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = de->proc_iops;
- inode->i_fop = de->proc_dir_ops;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = de->proc_iops;
- inode->i_fop = NULL;
- } else
- BUG();
- } else
- pde_put(de);
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else {
+ BUG();
+ }
return inode;
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6c1166ccdaea..317899222d7f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -12,6 +12,7 @@
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/mm.h>
+#include <linux/uio.h>
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
@@ -540,13 +541,14 @@ out:
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
- size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+ int write)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- void *kbuf;
+ size_t count = iov_iter_count(iter);
+ char *kbuf;
ssize_t error;
if (IS_ERR(head))
@@ -569,32 +571,30 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
error = -ENOMEM;
if (count >= KMALLOC_MAX_SIZE)
goto out;
+ kbuf = kzalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
if (write) {
- kbuf = memdup_user_nul(ubuf, count);
- if (IS_ERR(kbuf)) {
- error = PTR_ERR(kbuf);
- goto out;
- }
- } else {
- kbuf = kzalloc(count, GFP_KERNEL);
- if (!kbuf)
- goto out;
+ error = -EFAULT;
+ if (!copy_from_iter_full(kbuf, count, iter))
+ goto out_free_buf;
+ kbuf[count] = '\0';
}
error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
- ppos);
+ &iocb->ki_pos);
if (error)
goto out_free_buf;
/* careful: calling conventions are nasty here */
- error = table->proc_handler(table, write, kbuf, &count, ppos);
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
if (error)
goto out_free_buf;
if (!write) {
error = -EFAULT;
- if (copy_to_user(ubuf, kbuf, count))
+ if (copy_to_iter(kbuf, count, iter) < count)
goto out_free_buf;
}
@@ -607,16 +607,14 @@ out:
return error;
}
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+ return proc_sys_call_handler(iocb, iter, 0);
}
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+ return proc_sys_call_handler(iocb, iter, 1);
}
static int proc_sys_open(struct inode *inode, struct file *filp)
@@ -853,8 +851,10 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
static const struct file_operations proc_sys_file_operations = {
.open = proc_sys_open,
.poll = proc_sys_poll,
- .read = proc_sys_read,
- .write = proc_sys_write,
+ .read_iter = proc_sys_read,
+ .write_iter = proc_sys_write,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 3059a9394c2d..e59d4bb3a89e 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
{ MNT_NOATIME, ",noatime" },
{ MNT_NODIRATIME, ",nodiratime" },
{ MNT_RELATIME, ",relatime" },
+ { MNT_NOSYMFOLLOW, ",nosymfollow" },
{ 0, NULL }
};
const struct proc_fs_opts *fs_infop;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e8da1cde87b9..3fb7fc819b4f 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -137,8 +137,7 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = qnx4_count_free_blocks(sb);
buf->f_bavail = buf->f_bfree;
buf->f_namelen = QNX4_NAME_MAX;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 755293c8c71a..61191f7bdf62 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -166,8 +166,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes);
buf->f_bavail = buf->f_bfree;
buf->f_namelen = QNX6_LONG_NAME_MAX;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 19f5c4bf75aa..75f764b43418 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,27 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
return ret;
}
+static int warn_unsupported(struct file *file, const char *op)
+{
+ pr_warn_ratelimited(
+ "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+ op, file, current->pid, current->comm);
+ return -EINVAL;
+}
+
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs = get_fs();
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
return -EINVAL;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
+ /*
+ * Also fail if ->read_iter and ->read are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->read_iter || file->f_op->read))
+ return warn_unsupported(file, "read");
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- set_fs(KERNEL_DS);
- if (file->f_op->read)
- ret = file->f_op->read(file, (void __user *)buf, count, pos);
- else if (file->f_op->read_iter)
- ret = new_sync_read(file, (void __user *)buf, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_access(file);
add_rchar(current, ret);
}
@@ -510,28 +525,32 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs;
- const char __user *p;
+ struct kvec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ /*
+ * Also fail if ->write_iter and ->write are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->write_iter || file->f_op->write))
+ return warn_unsupported(file, "write");
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- p = (__force const char __user *)buf;
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- if (file->f_op->write)
- ret = file->f_op->write(file, p, count, pos);
- else if (file->f_op->write_iter)
- ret = new_sync_write(file, p, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ ret = file->f_op->write_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_modify(file);
add_wchar(current, ret);
}
@@ -889,7 +908,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
}
EXPORT_SYMBOL(vfs_iter_write);
-ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
@@ -1392,6 +1411,59 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
}
/*
+ * Performs necessary checks before doing a file copy
+ *
+ * Can adjust amount of bytes to copy via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the copy should be allowed.
+ */
+static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t *req_count, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ uint64_t count = *req_count;
+ loff_t size_in;
+ int ret;
+
+ ret = generic_file_rw_checks(file_in, file_out);
+ if (ret)
+ return ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EOVERFLOW;
+
+ /* Shorten the copy to EOF */
+ size_in = i_size_read(inode_in);
+ if (pos_in >= size_in)
+ count = 0;
+ else
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /* Don't allow overlapped copying within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + count > pos_in &&
+ pos_out < pos_in + count)
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+}
+
+/*
* copy_file_range() differs from regular file read and write in that it
* specifically allows return partial success. When it does so is up to
* the copy_file_range method.
@@ -1523,475 +1595,92 @@ out2:
return ret;
}
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- bool write)
-{
- struct inode *inode = file_inode(file);
-
- if (unlikely(pos < 0 || len < 0))
- return -EINVAL;
-
- if (unlikely((loff_t) (pos + len) < 0))
- return -EINVAL;
-
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- int retval;
-
- retval = locks_mandatory_area(inode, file, pos, end,
- write ? F_WRLCK : F_RDLCK);
- if (retval < 0)
- return retval;
- }
-
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
-}
-/*
- * Ensure that we don't remap a partial EOF block in the middle of something
- * else. Assume that the offsets have already been checked for block
- * alignment.
- *
- * For clone we only link a partial EOF block above or at the destination file's
- * EOF. For deduplication we accept a partial EOF block only if it ends at the
- * destination file's EOF (can not link it into the middle of a file).
- *
- * Shorten the request if possible.
- */
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if (pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-/* Read a page's worth of file data into the page cache. */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
- struct page *page;
-
- page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
-}
-
/*
- * Lock two pages, ensuring that we lock in offset order if the pages are from
- * the same file.
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
*/
-static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
- /* Always lock in order of increasing index. */
- if (page1->index > page2->index)
- swap(page1, page2);
-
- lock_page(page1);
- if (page1 != page2)
- lock_page(page2);
-}
-
-/* Unlock two pages, being careful not to unlock the same page twice. */
-static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
-{
- unlock_page(page1);
- if (page1 != page2)
- unlock_page(page2);
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- if (cmp_len <= 0)
- goto out_error;
-
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- put_page(src_page);
- goto out_error;
- }
-
- vfs_lock_two_pages(src_page, dest_page);
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+ loff_t limit = rlimit(RLIMIT_FSIZE);
- /*
- * Now that we've locked both pages, make sure they're still
- * mapped to the file data we're interested in. If not,
- * someone is invalidating pages on us and we lose.
- */
- if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
- src_page->mapping != src->i_mapping ||
- dest_page->mapping != dest->i_mapping) {
- same = false;
- goto unlock;
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
}
+ *count = min(*count, limit - pos);
+ }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
-unlock:
- vfs_unlock_two_pages(src_page, dest_page);
- put_page(dest_page);
- put_page(src_page);
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
- if (!same)
- break;
+ *count = min(*count, max_size - pos);
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
return 0;
-
-out_error:
- return error;
}
/*
- * Check that the two inodes are eligible for cloning, the ranges make
- * sense, and then flush all dirty data. Caller must ensure that the
- * inodes have been locked against any other modifications.
+ * Performs necessary checks before doing a write
*
- * If there's an error, then the usual negative error code is returned.
- * Otherwise returns 0 with *len set to the request length.
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ loff_t count;
int ret;
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ if (IS_SWAPFILE(inode))
return -ETXTBSY;
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP))
- ret = file_modified(file_out);
-
- return ret;
-}
-EXPORT_SYMBOL(generic_remap_file_range_prep);
-
-loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
- WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
-
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
- if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
- return -EXDEV;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret < 0)
- return ret;
-
- if (!file_in->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file_in, pos_in, len, false);
- if (ret)
- return ret;
-
- ret = remap_verify_area(file_out, pos_out, len, true);
- if (ret)
- return ret;
-
- ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out, len, remap_flags);
- if (ret < 0)
- return ret;
-
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- return ret;
-}
-EXPORT_SYMBOL(do_clone_file_range);
-
-loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
- file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- file_end_write(file_out);
-
- return ret;
-}
-EXPORT_SYMBOL(vfs_clone_file_range);
-
-/* Check whether we are allowed to dedupe the destination file */
-static bool allow_file_dedupe(struct file *file)
-{
- if (capable(CAP_SYS_ADMIN))
- return true;
- if (file->f_mode & FMODE_WRITE)
- return true;
- if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
- return true;
- if (!inode_permission(file_inode(file), MAY_WRITE))
- return true;
- return false;
-}
+ if (!iov_iter_count(from))
+ return 0;
-loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (iocb->ki_flags & IOCB_APPEND)
+ iocb->ki_pos = i_size_read(inode);
- WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
- REMAP_FILE_CAN_SHORTEN));
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
- ret = mnt_want_write_file(dst_file);
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
if (ret)
return ret;
- ret = remap_verify_area(dst_file, dst_pos, len, true);
- if (ret < 0)
- goto out_drop_write;
-
- ret = -EPERM;
- if (!allow_file_dedupe(dst_file))
- goto out_drop_write;
-
- ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
- goto out_drop_write;
-
- ret = -EISDIR;
- if (S_ISDIR(file_inode(dst_file)->i_mode))
- goto out_drop_write;
-
- ret = -EINVAL;
- if (!dst_file->f_op->remap_file_range)
- goto out_drop_write;
-
- if (len == 0) {
- ret = 0;
- goto out_drop_write;
- }
-
- ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
- dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
-out_drop_write:
- mnt_drop_write_file(dst_file);
-
- return ret;
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
}
-EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+EXPORT_SYMBOL(generic_write_checks);
-int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+/*
+ * Performs common checks before doing a file copy/clone
+ * from @file_in to @file_out.
+ */
+int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
- struct file_dedupe_range_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- u16 count = same->dest_count;
- loff_t deduped;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- if (same->reserved1 || same->reserved2)
- return -EINVAL;
-
- off = same->src_offset;
- len = same->src_length;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
- if (S_ISDIR(src->i_mode))
+ /* Don't copy dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
-
- if (!S_ISREG(src->i_mode))
- return -EINVAL;
-
- if (!file->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file, off, len, false);
- if (ret < 0)
- return ret;
- ret = 0;
-
- if (off + len > i_size_read(src))
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
- /* Arbitrary 1G limit on a single dedupe request, can be raised. */
- len = min_t(u64, len, 1 << 30);
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = FILE_DEDUPE_RANGE_SAME;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = dst_fd.file;
-
- if (!dst_file) {
- info->status = -EBADF;
- goto next_loop;
- }
-
- if (info->reserved) {
- info->status = -EINVAL;
- goto next_fdput;
- }
-
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- info->dest_offset, len,
- REMAP_FILE_CAN_SHORTEN);
- if (deduped == -EBADE)
- info->status = FILE_DEDUPE_RANGE_DIFFERS;
- else if (deduped < 0)
- info->status = deduped;
- else
- info->bytes_deduped = len;
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
-next_fdput:
- fdput(dst_fd);
-next_loop:
- if (fatal_signal_pending(current))
- break;
- }
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/remap_range.c b/fs/remap_range.c
new file mode 100644
index 000000000000..e6099beefa97
--- /dev/null
+++ b/fs/remap_range.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/sched/xacct.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/splice.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include "internal.h"
+
+#include <linux/uaccess.h>
+#include <asm/unistd.h>
+
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust amount of bytes to clone via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the clone should be allowed.
+ */
+static int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *req_count, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
+ uint64_t count = *req_count;
+ uint64_t bcount;
+ loff_t size_in, size_out;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ int ret;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EINVAL;
+
+ size_in = i_size_read(inode_in);
+ size_out = i_size_read(inode_out);
+
+ /* Dedupe requires both ranges to be within EOF. */
+ if ((remap_flags & REMAP_FILE_DEDUP) &&
+ (pos_in >= size_in || pos_in + count > size_in ||
+ pos_out >= size_out || pos_out + count > size_out))
+ return -EINVAL;
+
+ /* Ensure the infile range is within the infile. */
+ if (pos_in >= size_in)
+ return -EINVAL;
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /*
+ * If the user wanted us to link to the infile's EOF, round up to the
+ * next block boundary for this check.
+ *
+ * Otherwise, make sure the count is also block-aligned, having
+ * already confirmed the starting offsets' block alignment.
+ */
+ if (pos_in + count == size_in) {
+ bcount = ALIGN(size_in, bs) - pos_in;
+ } else {
+ if (!IS_ALIGNED(count, bs))
+ count = ALIGN_DOWN(count, bs);
+ bcount = count;
+ }
+
+ /* Don't allow overlapped cloning within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + bcount > pos_in &&
+ pos_out < pos_in + bcount)
+ return -EINVAL;
+
+ /*
+ * We shortened the request but the caller can't deal with that, so
+ * bounce the request back to userspace.
+ */
+ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+}
+
+static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+ bool write)
+{
+ struct inode *inode = file_inode(file);
+
+ if (unlikely(pos < 0 || len < 0))
+ return -EINVAL;
+
+ if (unlikely((loff_t) (pos + len) < 0))
+ return -EINVAL;
+
+ if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+ loff_t end = len ? pos + len - 1 : OFFSET_MAX;
+ int retval;
+
+ retval = locks_mandatory_area(inode, file, pos, end,
+ write ? F_WRLCK : F_RDLCK);
+ if (retval < 0)
+ return retval;
+ }
+
+ return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+}
+
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else. Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For clone we only link a partial EOF block above or at the destination file's
+ * EOF. For deduplication we accept a partial EOF block only if it ends at the
+ * destination file's EOF (can not link it into the middle of a file).
+ *
+ * Shorten the request if possible.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+ struct inode *inode_out,
+ loff_t pos_out,
+ loff_t *len,
+ unsigned int remap_flags)
+{
+ u64 blkmask = i_blocksize(inode_in) - 1;
+ loff_t new_len = *len;
+
+ if ((*len & blkmask) == 0)
+ return 0;
+
+ if (pos_out + *len < i_size_read(inode_out))
+ new_len &= ~blkmask;
+
+ if (new_len == *len)
+ return 0;
+
+ if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+ *len = new_len;
+ return 0;
+ }
+
+ return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+}
+
+/* Read a page's worth of file data into the page cache. */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+ struct page *page;
+
+ page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ return page;
+}
+
+/*
+ * Lock two pages, ensuring that we lock in offset order if the pages are from
+ * the same file.
+ */
+static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+{
+ /* Always lock in order of increasing index. */
+ if (page1->index > page2->index)
+ swap(page1, page2);
+
+ lock_page(page1);
+ if (page1 != page2)
+ lock_page(page2);
+}
+
+/* Unlock two pages, being careful not to unlock the same page twice. */
+static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+{
+ unlock_page(page1);
+ if (page1 != page2)
+ unlock_page(page2);
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0)
+ goto out_error;
+
+ src_page = vfs_dedupe_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = vfs_dedupe_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ put_page(src_page);
+ goto out_error;
+ }
+
+ vfs_lock_two_pages(src_page, dest_page);
+
+ /*
+ * Now that we've locked both pages, make sure they're still
+ * mapped to the file data we're interested in. If not,
+ * someone is invalidating pages on us and we lose.
+ */
+ if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
+ src_page->mapping != src->i_mapping ||
+ dest_page->mapping != dest->i_mapping) {
+ same = false;
+ goto unlock;
+ }
+
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+unlock:
+ vfs_unlock_two_pages(src_page, dest_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
+
+/*
+ * Check that the two inodes are eligible for cloning, the ranges make
+ * sense, and then flush all dirty data. Caller must ensure that the
+ * inodes have been locked against any other modifications.
+ *
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
+ */
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ bool same_inode = (inode_in == inode_out);
+ int ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Don't reflink dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ /* Zero length dedupe exits immediately; reflink goes to EOF. */
+ if (*len == 0) {
+ loff_t isize = i_size_read(inode_in);
+
+ if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
+ return 0;
+ if (pos_in > isize)
+ return -EINVAL;
+ *len = isize - pos_in;
+ if (*len == 0)
+ return 0;
+ }
+
+ /* Check that we don't violate system file offset limits. */
+ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + *len - 1);
+ if (ret)
+ return ret;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + *len - 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (remap_flags & REMAP_FILE_DEDUP) {
+ bool is_same = false;
+
+ ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same);
+ if (ret)
+ return ret;
+ if (!is_same)
+ return -EBADE;
+ }
+
+ ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
+
+ /* If can't alter the file contents, we're done. */
+ if (!(remap_flags & REMAP_FILE_DEDUP))
+ ret = file_modified(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(generic_remap_file_range_prep);
+
+loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
+{
+ loff_t ret;
+
+ WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
+
+ /*
+ * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
+ * the same mount. Practically, they only need to be on the same file
+ * system.
+ */
+ if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+ return -EXDEV;
+
+ ret = generic_file_rw_checks(file_in, file_out);
+ if (ret < 0)
+ return ret;
+
+ if (!file_in->f_op->remap_file_range)
+ return -EOPNOTSUPP;
+
+ ret = remap_verify_area(file_in, pos_in, len, false);
+ if (ret)
+ return ret;
+
+ ret = remap_verify_area(file_out, pos_out, len, true);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out, len, remap_flags);
+ if (ret < 0)
+ return ret;
+
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
+ return ret;
+}
+EXPORT_SYMBOL(do_clone_file_range);
+
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
+{
+ loff_t ret;
+
+ file_start_write(file_out);
+ ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
+ file_end_write(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+/* Check whether we are allowed to dedupe the destination file */
+static bool allow_file_dedupe(struct file *file)
+{
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+ if (file->f_mode & FMODE_WRITE)
+ return true;
+ if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
+ return true;
+ if (!inode_permission(file_inode(file), MAY_WRITE))
+ return true;
+ return false;
+}
+
+loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ struct file *dst_file, loff_t dst_pos,
+ loff_t len, unsigned int remap_flags)
+{
+ loff_t ret;
+
+ WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+ REMAP_FILE_CAN_SHORTEN));
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
+
+ ret = remap_verify_area(dst_file, dst_pos, len, true);
+ if (ret < 0)
+ goto out_drop_write;
+
+ ret = -EPERM;
+ if (!allow_file_dedupe(dst_file))
+ goto out_drop_write;
+
+ ret = -EXDEV;
+ if (src_file->f_path.mnt != dst_file->f_path.mnt)
+ goto out_drop_write;
+
+ ret = -EISDIR;
+ if (S_ISDIR(file_inode(dst_file)->i_mode))
+ goto out_drop_write;
+
+ ret = -EINVAL;
+ if (!dst_file->f_op->remap_file_range)
+ goto out_drop_write;
+
+ if (len == 0) {
+ ret = 0;
+ goto out_drop_write;
+ }
+
+ ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+ dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+
+int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+{
+ struct file_dedupe_range_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ u16 count = same->dest_count;
+ loff_t deduped;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ if (same->reserved1 || same->reserved2)
+ return -EINVAL;
+
+ off = same->src_offset;
+ len = same->src_length;
+
+ if (S_ISDIR(src->i_mode))
+ return -EISDIR;
+
+ if (!S_ISREG(src->i_mode))
+ return -EINVAL;
+
+ if (!file->f_op->remap_file_range)
+ return -EOPNOTSUPP;
+
+ ret = remap_verify_area(file, off, len, false);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ if (off + len > i_size_read(src))
+ return -EINVAL;
+
+ /* Arbitrary 1G limit on a single dedupe request, can be raised. */
+ len = min_t(u64, len, 1 << 30);
+
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = FILE_DEDUPE_RANGE_SAME;
+ }
+
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct fd dst_fd = fdget(info->dest_fd);
+ struct file *dst_file = dst_fd.file;
+
+ if (!dst_file) {
+ info->status = -EBADF;
+ goto next_loop;
+ }
+
+ if (info->reserved) {
+ info->status = -EINVAL;
+ goto next_fdput;
+ }
+
+ deduped = vfs_dedupe_file_range_one(file, off, dst_file,
+ info->dest_offset, len,
+ REMAP_FILE_CAN_SHORTEN);
+ if (deduped == -EBADE)
+ info->status = FILE_DEDUPE_RANGE_DIFFERS;
+ else if (deduped < 0)
+ info->status = deduped;
+ else
+ info->bytes_deduped = len;
+
+next_fdput:
+ fdput(dst_fd);
+next_loop:
+ if (fatal_signal_pending(current))
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index b1b7d3f5752f..259f684d9236 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -416,8 +416,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = buf->f_bavail = buf->f_ffree;
buf->f_blocks =
(romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/splice.c b/fs/splice.c
index 70cc52af780b..866d5c2367b2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -341,89 +341,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
-static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
- unsigned long vlen, loff_t offset)
-{
- mm_segment_t old_fs;
- loff_t pos = offset;
- ssize_t res;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
- set_fs(old_fs);
-
- return res;
-}
-
-static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
-{
- struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
- struct iov_iter to;
- struct page **pages;
- unsigned int nr_pages;
- unsigned int mask;
- size_t offset, base, copied = 0;
- ssize_t res;
- int i;
-
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
- return -EAGAIN;
-
- /*
- * Try to keep page boundaries matching to source pagecache ones -
- * it probably won't be much help, but...
- */
- offset = *ppos & ~PAGE_MASK;
-
- iov_iter_pipe(&to, READ, pipe, len + offset);
-
- res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
- if (res <= 0)
- return -ENOMEM;
-
- nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
-
- vec = __vec;
- if (nr_pages > PIPE_DEF_BUFFERS) {
- vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL);
- if (unlikely(!vec)) {
- res = -ENOMEM;
- goto out;
- }
- }
-
- mask = pipe->ring_size - 1;
- pipe->bufs[to.head & mask].offset = offset;
- pipe->bufs[to.head & mask].len -= offset;
-
- for (i = 0; i < nr_pages; i++) {
- size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
- vec[i].iov_base = page_address(pages[i]) + offset;
- vec[i].iov_len = this_len;
- len -= this_len;
- offset = 0;
- }
-
- res = kernel_readv(in, vec, nr_pages, *ppos);
- if (res > 0) {
- copied = res;
- *ppos += res;
- }
-
- if (vec != __vec)
- kfree(vec);
-out:
- for (i = 0; i < nr_pages; i++)
- put_page(pages[i]);
- kvfree(pages);
- iov_iter_advance(&to, copied); /* truncates and discards */
- return res;
-}
-
/*
* Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
* using sendpage(). Return the number of bytes sent.
@@ -807,33 +724,6 @@ done:
EXPORT_SYMBOL(iter_file_splice_write);
-static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- int ret;
- void *data;
- loff_t tmp = sd->pos;
-
- data = kmap(buf->page);
- ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
- kunmap(buf->page);
-
- return ret;
-}
-
-static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out, loff_t *ppos,
- size_t len, unsigned int flags)
-{
- ssize_t ret;
-
- ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
- if (ret > 0)
- *ppos += ret;
-
- return ret;
-}
-
/**
* generic_splice_sendpage - splice data from a pipe to a socket
* @pipe: pipe to splice from
@@ -855,15 +745,23 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
EXPORT_SYMBOL(generic_splice_sendpage);
+static int warn_unsupported(struct file *file, const char *op)
+{
+ pr_debug_ratelimited(
+ "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+ op, file, current->pid, current->comm);
+ return -EINVAL;
+}
+
/*
* Attempt to initiate a splice from pipe to file.
*/
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- if (out->f_op->splice_write)
- return out->f_op->splice_write(pipe, out, ppos, len, flags);
- return default_file_splice_write(pipe, out, ppos, len, flags);
+ if (unlikely(!out->f_op->splice_write))
+ return warn_unsupported(out, "write");
+ return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
/*
@@ -885,9 +783,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;
- if (in->f_op->splice_read)
- return in->f_op->splice_read(in, ppos, pipe, len, flags);
- return default_file_splice_read(in, ppos, pipe, len, flags);
+ if (unlikely(!in->f_op->splice_read))
+ return warn_unsupported(in, "read");
+ return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
/**
@@ -1107,9 +1005,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* Determine where to splice to/from.
*/
-long do_splice(struct file *in, loff_t __user *off_in,
- struct file *out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+long do_splice(struct file *in, loff_t *off_in, struct file *out,
+ loff_t *off_out, size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
@@ -1143,8 +1040,7 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (off_out) {
if (!(out->f_mode & FMODE_PWRITE))
return -EINVAL;
- if (copy_from_user(&offset, off_out, sizeof(loff_t)))
- return -EFAULT;
+ offset = *off_out;
} else {
offset = out->f_pos;
}
@@ -1165,8 +1061,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (!off_out)
out->f_pos = offset;
- else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
- ret = -EFAULT;
+ else
+ *off_out = offset;
return ret;
}
@@ -1177,8 +1073,7 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
- if (copy_from_user(&offset, off_in, sizeof(loff_t)))
- return -EFAULT;
+ offset = *off_in;
} else {
offset = in->f_pos;
}
@@ -1202,8 +1097,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
wakeup_pipe_readers(opipe);
if (!off_in)
in->f_pos = offset;
- else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
- ret = -EFAULT;
+ else
+ *off_in = offset;
return ret;
}
@@ -1211,6 +1106,46 @@ long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
}
+static long __do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
+{
+ struct pipe_inode_info *ipipe;
+ struct pipe_inode_info *opipe;
+ loff_t offset, *__off_in = NULL, *__off_out = NULL;
+ long ret;
+
+ ipipe = get_pipe_info(in, true);
+ opipe = get_pipe_info(out, true);
+
+ if (ipipe && off_in)
+ return -ESPIPE;
+ if (opipe && off_out)
+ return -ESPIPE;
+
+ if (off_out) {
+ if (copy_from_user(&offset, off_out, sizeof(loff_t)))
+ return -EFAULT;
+ __off_out = &offset;
+ }
+ if (off_in) {
+ if (copy_from_user(&offset, off_in, sizeof(loff_t)))
+ return -EFAULT;
+ __off_in = &offset;
+ }
+
+ ret = do_splice(in, __off_in, out, __off_out, len, flags);
+ if (ret < 0)
+ return ret;
+
+ if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
+ return -EFAULT;
+ if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
+ return -EFAULT;
+
+ return ret;
+}
+
static int iter_to_pipe(struct iov_iter *from,
struct pipe_inode_info *pipe,
unsigned flags)
@@ -1405,8 +1340,8 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
if (in.file) {
out = fdget(fd_out);
if (out.file) {
- error = do_splice(in.file, off_in, out.file, off_out,
- len, flags);
+ error = __do_splice(in.file, off_in, out.file, off_out,
+ len, flags);
fdput(out);
}
fdput(in);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0cc4ceec0562..d6c6593ec169 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -380,8 +380,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = msblk->inodes;
buf->f_ffree = 0;
buf->f_namelen = SQUASHFS_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/stat.c b/fs/stat.c
index 44f8ad346db4..dacecdda2e79 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -56,7 +56,7 @@ EXPORT_SYMBOL(generic_fillattr);
* @path: file to get attributes from
* @stat: structure to return attributes in
* @request_mask: STATX_xxx flags indicating what the caller wants
- * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
*
* Get attributes without calling security_inode_getattr.
*
@@ -71,7 +71,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
memset(stat, 0, sizeof(*stat));
stat->result_mask |= STATX_BASIC_STATS;
- query_flags &= KSTAT_QUERY_FLAGS;
+ query_flags &= AT_STATX_SYNC_TYPE;
/* allow the fs to override these if it really wants to */
/* SB_NOATIME means filesystem supplies dummy atime value */
@@ -97,7 +97,7 @@ EXPORT_SYMBOL(vfs_getattr_nosec);
* @path: The file of interest
* @stat: Where to return the statistics
* @request_mask: STATX_xxx flags indicating what the caller wants
- * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
+ * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
*
* Ask the filesystem for a file's attributes. The caller must indicate in
* request_mask and query_flags to indicate what they want.
@@ -126,53 +126,27 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
EXPORT_SYMBOL(vfs_getattr);
/**
- * vfs_statx_fd - Get the enhanced basic attributes by file descriptor
+ * vfs_fstat - Get the basic attributes by file descriptor
* @fd: The file descriptor referring to the file of interest
* @stat: The result structure to fill in.
- * @request_mask: STATX_xxx flags indicating what the caller wants
- * @query_flags: Query mode (KSTAT_QUERY_FLAGS)
*
* This function is a wrapper around vfs_getattr(). The main difference is
* that it uses a file descriptor to determine the file location.
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-int vfs_statx_fd(unsigned int fd, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+int vfs_fstat(int fd, struct kstat *stat)
{
struct fd f;
- int error = -EBADF;
-
- if (query_flags & ~KSTAT_QUERY_FLAGS)
- return -EINVAL;
+ int error;
f = fdget_raw(fd);
- if (f.file) {
- error = vfs_getattr(&f.file->f_path, stat,
- request_mask, query_flags);
- fdput(f);
- }
+ if (!f.file)
+ return -EBADF;
+ error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0);
+ fdput(f);
return error;
}
-EXPORT_SYMBOL(vfs_statx_fd);
-
-static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags,
- int flags)
-{
- if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
- AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
- return -EINVAL;
-
- *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- *lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_NO_AUTOMOUNT)
- *lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- *lookup_flags |= LOOKUP_EMPTY;
-
- return 0;
-}
/**
* vfs_statx - Get basic and extra attributes by filename
@@ -189,15 +163,24 @@ static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags,
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-int vfs_statx(int dfd, const char __user *filename, int flags,
+static int vfs_statx(int dfd, const char __user *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- int error = -EINVAL;
- unsigned lookup_flags;
+ unsigned lookup_flags = 0;
+ int error;
- if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
+ if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
+ AT_STATX_SYNC_TYPE))
return -EINVAL;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT))
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
@@ -217,8 +200,13 @@ retry:
out:
return error;
}
-EXPORT_SYMBOL(vfs_statx);
+int vfs_fstatat(int dfd, const char __user *filename,
+ struct kstat *stat, int flags)
+{
+ return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
+ stat, STATX_BASIC_STATS);
+}
#ifdef __ARCH_WANT_OLD_STAT
diff --git a/fs/statfs.c b/fs/statfs.c
index 2616424012ea..59f33752c131 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags)
flags |= ST_NODIRATIME;
if (mnt_flags & MNT_RELATIME)
flags |= ST_RELATIME;
+ if (mnt_flags & MNT_NOSYMFOLLOW)
+ flags |= ST_NOSYMFOLLOW;
return flags;
}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 02b1d9d0c182..be47263b8605 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -98,8 +98,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sbi->s_ninodes;
buf->f_ffree = sysv_count_free_inodes(sb);
buf->f_namelen = SYSV_NAMELEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index faf2017ada11..5bef3a68395d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -2414,8 +2414,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
+ buf->f_bfree;
buf->f_ffree = buf->f_bfree;
buf->f_namelen = UDF_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e3b69fb280e8..983558b572c7 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1431,8 +1431,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
? (buf->f_bfree - uspi->s_root_blocks) : 0;
buf->f_files = uspi->s_ncg * uspi->s_ipg;
buf->f_namelen = UFS_MAXNAMLEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
mutex_unlock(&UFS_SB(sb)->s_lock);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index e685299eb3d2..9fac5ea8d0e4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,31 @@ config XFS_FS
system of your root partition is compiled as a module, you'll need
to use an initial ramdisk (initrd) to boot.
+config XFS_SUPPORT_V4
+ bool "Support deprecated V4 (crc=0) format"
+ depends on XFS_FS
+ default y
+ help
+ The V4 filesystem format lacks certain features that are supported
+ by the V5 format, such as metadata checksumming, strengthened
+ metadata verification, and the ability to store timestamps past the
+ year 2038. Because of this, the V4 format is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect a V4 filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "crc=". If the string "crc=0" is found, the
+ filesystem is a V4 filesystem. If no such string is found, please
+ upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ V4 format will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting the old V4 format (crc=0), say Y.
+ To close off an attack surface, say N.
+
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 3f80cede7406..48d8e9caf86f 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -96,8 +96,6 @@ xfs_attr3_rmt_verify(
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return __this_address;
if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 1b0a01b06a05..d9a692484eae 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5046,20 +5046,25 @@ xfs_bmap_del_extent_real(
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
xfs_filblks_t len;
xfs_extlen_t mod;
- bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
&mod);
ASSERT(mod == 0);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
+ if (!(bflags & XFS_BMAPI_REMAP)) {
+ xfs_fsblock_t bno;
+
+ bno = div_u64_rem(del->br_startblock,
+ mp->m_sb.sb_rextsize, &mod);
+ ASSERT(mod == 0);
+
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ }
+
do_fx = 0;
nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b40a4e80f5ee..b876b44c0204 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -15,8 +15,8 @@
*/
#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
typedef struct xfs_da_blkinfo {
__be32 forw; /* previous block in list */
@@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo {
*/
#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
-#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
struct xfs_da3_blkinfo {
/*
@@ -61,7 +61,7 @@ struct xfs_da3_blkinfo {
* Since we have duplicate keys, use a binary search but always follow
* all match in the block, not just the first match found.
*/
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
typedef struct xfs_da_node_hdr {
struct xfs_da_blkinfo info; /* block type, links, etc. */
@@ -746,14 +746,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index d8f586256add..eff4a127188e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,8 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Deferred Operations in XFS
@@ -186,8 +188,9 @@ xfs_defer_create_intent(
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
- dfp->dfp_count, sort);
+ if (!dfp->dfp_intent)
+ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+ dfp->dfp_count, sort);
}
/*
@@ -312,22 +315,6 @@ xfs_defer_trans_roll(
}
/*
- * Reset an already used dfops after finish.
- */
-static void
-xfs_defer_reset(
- struct xfs_trans *tp)
-{
- ASSERT(list_empty(&tp->t_dfops));
-
- /*
- * Low mode state transfers across transaction rolls to mirror dfops
- * lifetime. Clear it now that dfops is reset.
- */
- tp->t_flags &= ~XFS_TRANS_LOWMODE;
-}
-
-/*
* Free up any items left in the list.
*/
static void
@@ -360,6 +347,58 @@ xfs_defer_cancel_list(
}
/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
@@ -390,6 +429,7 @@ xfs_defer_finish_one(
list_add(li, &dfp->dfp_work);
dfp->dfp_count++;
dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
xfs_defer_create_intent(tp, dfp, false);
}
@@ -428,13 +468,27 @@ xfs_defer_finish_noroll(
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+ /*
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
+ */
xfs_defer_create_intents(*tp);
- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
error = xfs_defer_trans_roll(tp);
if (error)
goto out_shutdown;
+ /* Possibly relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
error = xfs_defer_finish_one(*tp, dfp);
@@ -475,7 +529,10 @@ xfs_defer_finish(
return error;
}
}
- xfs_defer_reset(*tp);
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -549,6 +606,139 @@ xfs_defer_move(
* that behavior.
*/
dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip)
+{
+ struct xfs_defer_capture *dfc;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ xfs_defer_create_intents(tp);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ /*
+ * Grab an extra reference to this inode and attach it to the capture
+ * structure.
+ */
+ if (capture_ip) {
+ ihold(VFS_I(capture_ip));
+ dfc->dfc_capture_ip = capture_ip;
+ }
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_release(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+ if (dfc->dfc_capture_ip)
+ xfs_irele(dfc->dfc_capture_ip);
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp, capture_ip);
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_release(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp)
+{
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock and join the captured inode to the new transaction. */
+ if (dfc->dfc_capture_ip) {
+ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
+ }
+ *captured_ipp = dfc->dfc_capture_ip;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
- xfs_defer_reset(stp);
+ kmem_free(dfc);
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 6b2ca580f2b0..05472f71fffe 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -8,6 +8,7 @@
struct xfs_btree_cur;
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -63,4 +64,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ /*
+ * An inode reference that must be maintained to complete the deferred
+ * work.
+ */
+ struct xfs_inode *dfc_capture_ip;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct xfs_inode *capture_ip, struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp);
+void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 536666143fe7..ef5eaf33d146 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -17,7 +17,7 @@ struct xfs_dinode;
*/
struct xfs_icdinode {
uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_projid; /* owner's project id */
+ prid_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 27c39268c31f..340c83f76c80 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2505,12 +2505,15 @@ xfs_rmap_map_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2521,12 +2524,15 @@ xfs_rmap_unmap_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/*
@@ -2543,12 +2549,15 @@ xfs_rmap_convert_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 1d9fa8a300f1..6c1aba16113c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1018,7 +1018,6 @@ xfs_rtalloc_query_range(
struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
- xfs_rtblock_t rem;
int is_free;
int error = 0;
@@ -1027,13 +1026,12 @@ xfs_rtalloc_query_range(
if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
low_rec->ar_startext == high_rec->ar_startext)
return 0;
- if (high_rec->ar_startext > mp->m_sb.sb_rextents)
- high_rec->ar_startext = mp->m_sb.sb_rextents;
+ high_rec->ar_startext = min(high_rec->ar_startext,
+ mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */
rtstart = low_rec->ar_startext;
- rem = high_rec->ar_startext - rtstart;
- while (rem) {
+ while (rtstart <= high_rec->ar_startext) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
&is_free);
@@ -1042,7 +1040,7 @@ xfs_rtalloc_query_range(
/* How long does the extent go for? */
error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startext - 1, &rtend);
+ high_rec->ar_startext, &rtend);
if (error)
break;
@@ -1055,7 +1053,6 @@ xfs_rtalloc_query_range(
break;
}
- rem -= rtend - rtstart + 1;
rtstart = rtend + 1;
}
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index e56786f0a13c..653f3280e1c1 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -441,6 +441,20 @@ xchk_da_btree_block(
goto out_freebp;
}
+ /*
+ * If we've been handed a block that is below the dabtree root, does
+ * its hashval match what the parent block expected to see?
+ */
+ if (level > 0) {
+ struct xfs_da_node_entry *key;
+
+ key = xchk_da_btree_node_entry(ds, level - 1);
+ if (be32_to_cpu(key->hashval) != blk->hashval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
out:
return error;
out_freebp:
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ec3691372e7c..9e16a4d0f97c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -24,6 +24,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
@@ -423,30 +424,26 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
STATIC int
xfs_bui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_bmbt_irec irec;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
xfs_filblks_t count;
xfs_exntst_t state;
- enum xfs_bmap_intent_type type;
- bool op_ok;
unsigned int bui_type;
int whichfork;
int error = 0;
/* Only one mapping operation per BUI... */
- if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- xfs_bui_release(buip);
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
return -EFSCORRUPTED;
- }
/*
* First check the validity of the extent described by the
@@ -457,76 +454,58 @@ xfs_bui_item_recover(
XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
XFS_INO_TO_FSB(mp, bmap->me_owner)));
- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ switch (bui_type) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- op_ok = true;
break;
default:
- op_ok = false;
- break;
+ return -EFSCORRUPTED;
}
- if (!op_ok || startblock_fsb == 0 ||
+ if (startblock_fsb == 0 ||
bmap->me_len == 0 ||
inode_fsb == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
bmap->me_len >= mp->m_sb.sb_agblocks ||
inode_fsb >= mp->m_sb.sb_dblocks ||
- (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the BUI from the AIL and
- * free the memory associated with it.
- */
- xfs_bui_release(buip);
+ (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ /* Grab the inode. */
+ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
- budp = xfs_trans_get_bud(tp, buip);
- /* Grab the inode. */
- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+ error = xfs_qm_dqattach(ip);
if (error)
- goto err_inode;
+ goto err_rele;
if (VFS_I(ip)->i_nlink == 0)
xfs_iflags_set(ip, XFS_IRECOVERY);
- /* Process deferred bmap item. */
- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- switch (bui_type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- type = bui_type;
- break;
- default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto err_inode;
- }
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
- bmap->me_startoff, bmap->me_startblock, &count, state);
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
if (error)
- goto err_inode;
+ goto err_cancel;
if (count > 0) {
- ASSERT(type == XFS_BMAP_UNMAP);
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
@@ -534,20 +513,24 @@ xfs_bui_item_recover(
xfs_bmap_unmap_extent(tp, ip, &irec);
}
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ if (error)
+ goto err_unlock;
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
+ return 0;
- return error;
-
-err_inode:
- xfs_defer_move(parent_tp, tp);
+err_cancel:
xfs_trans_cancel(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
return error;
}
@@ -559,6 +542,32 @@ xfs_bui_item_match(
return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bud_log_item *budp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = BUI_ITEM(intent)->bui_format.bui_nextents;
+ extp = BUI_ITEM(intent)->bui_format.bui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ atomic_set(&buip->bui_next_extent, count);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+ return &buip->bui_item;
+}
+
static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
@@ -566,6 +575,7 @@ static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_release = xfs_bui_item_release,
.iop_recover = xfs_bui_item_recover,
.iop_match = xfs_bui_item_match,
+ .iop_relog = xfs_bui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f2a8a0e75e1f..7371a7f7c652 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -947,11 +947,11 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip)) {
- xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
-
- if ((startoffset_fsb | endoffset_fsb) & (extsz - 1))
- return -EINVAL;
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ startoffset_fsb = roundup_64(startoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ endoffset_fsb = rounddown_64(endoffset_fsb,
+ mp->m_sb.sb_rextsize);
}
/*
@@ -1147,14 +1147,6 @@ xfs_insert_file_space(
trace_xfs_insert_file_space(ip);
- /* We can only insert complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip)) {
- xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
-
- if ((stop_fsb | shift_fsb) & (extsz - 1))
- return -EINVAL;
- }
-
error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 24c7a8d11e1a..d44e8b4a3391 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -719,6 +719,8 @@ xlog_recover_get_buf_lsn(
case XFS_ABTC_MAGIC:
case XFS_RMAP_CRC_MAGIC:
case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
case XFS_IBT_CRC_MAGIC:
case XFS_IBT_MAGIC: {
struct xfs_btree_block *btb = blk;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3072814e407d..1d95ed387d66 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -831,8 +831,8 @@ xfs_qm_dqget_checks(
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
- * dquot, doing an allocation (if requested) as needed.
+ * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
+ * locked dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6cb8cd11072a..6c11bfc3d452 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
STATIC int
xfs_efi_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
@@ -608,14 +608,8 @@ xfs_efi_item_recover(
if (startblock_fsb == 0 ||
extp->ext_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
- extp->ext_len >= mp->m_sb.sb_agblocks) {
- /*
- * This will pull the EFI from the AIL and
- * free the memory associated with it.
- */
- xfs_efi_release(efip);
+ extp->ext_len >= mp->m_sb.sb_agblocks)
return -EFSCORRUPTED;
- }
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -633,8 +627,7 @@ xfs_efi_item_recover(
}
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_trans_cancel(tp);
@@ -649,6 +642,34 @@ xfs_efi_item_match(
return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
@@ -656,6 +677,7 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_release = xfs_efi_item_release,
.iop_recover = xfs_efi_item_recover,
.iop_match = xfs_efi_item_match,
+ .iop_relog = xfs_efi_item_relog,
};
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3d1b95124744..5b0f93f73837 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -32,6 +32,39 @@
static const struct vm_operations_struct xfs_file_vm_ops;
+/*
+ * Decide if the given file range is aligned to the size of the fundamental
+ * allocation unit for the file.
+ */
+static bool
+xfs_is_falloc_aligned(
+ struct xfs_inode *ip,
+ loff_t pos,
+ long long int len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t mask;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
+ u64 rextbytes;
+ u32 mod;
+
+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ div_u64_rem(pos, rextbytes, &mod);
+ if (mod)
+ return false;
+ div_u64_rem(len, rextbytes, &mod);
+ return mod == 0;
+ }
+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
+ } else {
+ mask = mp->m_sb.sb_blocksize - 1;
+ }
+
+ return !((pos | len) & mask);
+}
+
int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
@@ -850,9 +883,7 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
-
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -872,10 +903,9 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_INSERT_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
loff_t isize = i_size_read(inode);
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 1a88025e68a3..db23e455eb91 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -33,39 +33,7 @@ enum xfs_fstrm_alloc {
/*
* Allocation group filestream associations are tracked with per-ag atomic
* counters. These counters allow xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation. When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all. The race condition this addresses is the following:
- *
- * - The work queue scheduler fires and pulls a filestream directory cache
- * element off the LRU end of the cache for deletion, then gets pre-empted.
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the
- * remaining items from the cache and reallocates the mount point's per-ag
- * array, resetting all the counters to zero.
- * - The work queue thread resumes and calls the free function for the element
- * it started cleaning up earlier. In the process it decrements the
- * filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
+ * particular AG already has active filestreams associated with it.
*/
int
xfs_filestream_peek_ag(
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 4eebcec4aae6..9ce5e7d5bf8f 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -26,7 +26,7 @@
#include "xfs_rtalloc.h"
/* Convert an xfs_fsmap to an fsmap. */
-void
+static void
xfs_fsmap_from_internal(
struct fsmap *dest,
struct xfs_fsmap *src)
@@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap(
/* getfsmap query state */
struct xfs_getfsmap_info {
struct xfs_fsmap_head *head;
- xfs_fsmap_format_t formatter; /* formatting fn */
- void *format_arg; /* format buffer */
+ struct fsmap *fsmap_recs; /* mapping records */
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
xfs_daddr_t next_daddr; /* next daddr we expect */
u64 missing_owner; /* owner of holes */
@@ -224,6 +223,20 @@ xfs_getfsmap_is_shared(
return 0;
}
+static inline void
+xfs_getfsmap_format(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *xfm,
+ struct xfs_getfsmap_info *info)
+{
+ struct fsmap *rec;
+
+ trace_xfs_getfsmap_mapping(mp, xfm);
+
+ rec = &info->fsmap_recs[info->head->fmh_entries++];
+ xfs_fsmap_from_internal(rec, xfm);
+}
+
/*
* Format a reverse mapping for getfsmap, having translated rm_startblock
* into the appropriate daddr units.
@@ -256,6 +269,9 @@ xfs_getfsmap_helper(
/* Are we just counting mappings? */
if (info->head->fmh_count == 0) {
+ if (info->head->fmh_entries == UINT_MAX)
+ return -ECANCELED;
+
if (rec_daddr > info->next_daddr)
info->head->fmh_entries++;
@@ -285,10 +301,7 @@ xfs_getfsmap_helper(
fmr.fmr_offset = 0;
fmr.fmr_length = rec_daddr - info->next_daddr;
fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
}
if (info->last)
@@ -320,11 +333,8 @@ xfs_getfsmap_helper(
if (shared)
fmr.fmr_flags |= FMR_OF_SHARED;
}
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
out:
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
@@ -792,11 +802,11 @@ xfs_getfsmap_check_keys(
#endif /* CONFIG_XFS_RT */
/*
- * Get filesystem's extents as described in head, and format for
- * output. Calls formatter to fill the user's buffer until all
- * extents are mapped, until the passed-in head->fmh_count slots have
- * been filled, or until the formatter short-circuits the loop, if it
- * is tracking filled-in extents on its own.
+ * Get filesystem's extents as described in head, and format for output. Fills
+ * in the supplied records array until there are no more reverse mappings to
+ * return or head.fmh_entries == head.fmh_count. In the second case, this
+ * function returns -ECANCELED to indicate that more records would have been
+ * returned.
*
* Key to Confusion
* ----------------
@@ -816,8 +826,7 @@ int
xfs_getfsmap(
struct xfs_mount *mp,
struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter,
- void *arg)
+ struct fsmap *fsmap_recs)
{
struct xfs_trans *tp = NULL;
struct xfs_fsmap dkeys[2]; /* per-dev keys */
@@ -892,8 +901,7 @@ xfs_getfsmap(
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
- info.formatter = formatter;
- info.format_arg = arg;
+ info.fsmap_recs = fsmap_recs;
info.head = head;
/*
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
index c6c57739b862..a0775788e7b1 100644
--- a/fs/xfs/xfs_fsmap.h
+++ b/fs/xfs/xfs_fsmap.h
@@ -27,13 +27,9 @@ struct xfs_fsmap_head {
struct xfs_fsmap fmh_keys[2]; /* low and high keys */
};
-void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
-/* fsmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
-
int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter, void *arg);
+ struct fsmap *out_recs);
#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 49624973eecc..2bfbcf28b1bd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -698,6 +698,68 @@ out_unlock:
return error;
}
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ unsigned int di_flags = 0;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ }
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_d.di_flags |= di_flags;
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+ }
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
+}
+
/*
* Allocate an inode on disk and return a copy of its in-core version.
* The in-core inode is locked exclusively. Set mode, nlink, and rdev
@@ -841,54 +903,10 @@ xfs_ialloc(
break;
case S_IFREG:
case S_IFDIR:
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
-
- if (S_ISDIR(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- }
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_d.di_flags |= di_flags;
- }
- if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
- }
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
- }
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
/* FALLTHROUGH */
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
@@ -1516,17 +1534,10 @@ xfs_itruncate_extents_flags(
if (error)
goto out;
- /*
- * Duplicate the transaction that has the permanent
- * reservation and commit the old transaction.
- */
+ /* free the just unmapped extents */
error = xfs_defer_finish(&tp);
if (error)
goto out;
-
- error = xfs_trans_roll_inode(&tp, ip);
- if (error)
- goto out;
}
if (whichfork == XFS_DATA_FORK) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bca7659fb5c6..3fbd98f61ea5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1716,39 +1716,17 @@ out_free_buf:
return error;
}
-struct getfsmap_info {
- struct xfs_mount *mp;
- struct fsmap_head __user *data;
- unsigned int idx;
- __u32 last_flags;
-};
-
-STATIC int
-xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
-{
- struct getfsmap_info *info = priv;
- struct fsmap fm;
-
- trace_xfs_getfsmap_mapping(info->mp, xfm);
-
- info->last_flags = xfm->fmr_flags;
- xfs_fsmap_from_internal(&fm, xfm);
- if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
- sizeof(struct fsmap)))
- return -EFAULT;
-
- return 0;
-}
-
STATIC int
xfs_ioc_getfsmap(
struct xfs_inode *ip,
struct fsmap_head __user *arg)
{
- struct getfsmap_info info = { NULL };
struct xfs_fsmap_head xhead = {0};
struct fsmap_head head;
- bool aborted = false;
+ struct fsmap *recs;
+ unsigned int count;
+ __u32 last_flags = 0;
+ bool done = false;
int error;
if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
@@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap(
sizeof(head.fmh_keys[1].fmr_reserved)))
return -EINVAL;
+ /*
+ * Use an internal memory buffer so that we don't have to copy fsmap
+ * data to userspace while holding locks. Start by trying to allocate
+ * up to 128k for the buffer, but fall back to a single page if needed.
+ */
+ count = min_t(unsigned int, head.fmh_count,
+ 131072 / sizeof(struct fsmap));
+ recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs) {
+ count = min_t(unsigned int, head.fmh_count,
+ PAGE_SIZE / sizeof(struct fsmap));
+ recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs)
+ return -ENOMEM;
+ }
+
xhead.fmh_iflags = head.fmh_iflags;
- xhead.fmh_count = head.fmh_count;
xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
- info.mp = ip->i_mount;
- info.data = arg;
- error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
- if (error == -ECANCELED) {
- error = 0;
- aborted = true;
- } else if (error)
- return error;
+ head.fmh_entries = 0;
+ do {
+ struct fsmap __user *user_recs;
+ struct fsmap *last_rec;
+
+ user_recs = &arg->fmh_recs[head.fmh_entries];
+ xhead.fmh_entries = 0;
+ xhead.fmh_count = min_t(unsigned int, count,
+ head.fmh_count - head.fmh_entries);
+
+ /* Run query, record how many entries we got. */
+ error = xfs_getfsmap(ip->i_mount, &xhead, recs);
+ switch (error) {
+ case 0:
+ /*
+ * There are no more records in the result set. Copy
+ * whatever we got to userspace and break out.
+ */
+ done = true;
+ break;
+ case -ECANCELED:
+ /*
+ * The internal memory buffer is full. Copy whatever
+ * records we got to userspace and go again if we have
+ * not yet filled the userspace buffer.
+ */
+ error = 0;
+ break;
+ default:
+ goto out_free;
+ }
+ head.fmh_entries += xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
- /* If we didn't abort, set the "last" flag in the last fmx */
- if (!aborted && info.idx) {
- info.last_flags |= FMR_OF_LAST;
- if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
- &info.last_flags, sizeof(info.last_flags)))
- return -EFAULT;
+ /*
+ * If the caller wanted a record count or there aren't any
+ * new records to return, we're done.
+ */
+ if (head.fmh_count == 0 || xhead.fmh_entries == 0)
+ break;
+
+ /* Copy all the records we got out to userspace. */
+ if (copy_to_user(user_recs, recs,
+ xhead.fmh_entries * sizeof(struct fsmap))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ /* Remember the last record flags we copied to userspace. */
+ last_rec = &recs[xhead.fmh_entries - 1];
+ last_flags = last_rec->fmr_flags;
+
+ /* Set up the low key for the next iteration. */
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ } while (!done && head.fmh_entries < head.fmh_count);
+
+ /*
+ * If there are no more records in the query result set and we're not
+ * in counting mode, mark the last record returned with the LAST flag.
+ */
+ if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
+ struct fsmap __user *user_rec;
+
+ last_flags |= FMR_OF_LAST;
+ user_rec = &arg->fmh_recs[head.fmh_entries - 1];
+
+ if (copy_to_user(&user_rec->fmr_flags, &last_flags,
+ sizeof(last_flags))) {
+ error = -EFAULT;
+ goto out_free;
+ }
}
/* copy back header */
- head.fmh_entries = xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
- return -EFAULT;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
- return 0;
+out_free:
+ kmem_free(recs);
+ return error;
}
STATIC int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 80a13c8561d8..5e165456da68 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -237,7 +237,7 @@ xfs_vn_create(
umode_t mode,
bool flags)
{
- return xfs_vn_mknod(dir, dentry, mode, 0);
+ return xfs_generic_create(dir, dentry, mode, 0, false);
}
STATIC int
@@ -246,7 +246,7 @@ xfs_vn_mkdir(
struct dentry *dentry,
umode_t mode)
{
- return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+ return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
}
STATIC struct dentry *
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ab737fed7b12..5b7a1e201559 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -123,7 +123,6 @@ typedef __u32 xfs_nlink_t;
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
/*
@@ -176,6 +175,12 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev)
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
+static inline uint64_t rounddown_64(uint64_t x, uint32_t y)
+{
+ do_div(x, y);
+ return x * y;
+}
+
static inline uint64_t roundup_64(uint64_t x, uint32_t y)
{
x += y - 1;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ad0c69ee8947..fa2d05e65ff1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1475,14 +1475,14 @@ xlog_commit_record(
}
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1508,7 +1508,7 @@ xlog_grant_push_ail(
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1528,13 +1528,33 @@ xlog_grant_push_ail(
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
@@ -1604,9 +1624,7 @@ xlog_cksum(
int i;
int xheads;
- xheads = size / XLOG_HEADER_CYCLE_SIZE;
- if (size % XLOG_HEADER_CYCLE_SIZE)
- xheads++;
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 1412d6993f1e..58c3fcbec94a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -141,4 +141,6 @@ void xfs_log_quiesce(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a17d788921d6..87886b7f77da 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -371,6 +371,19 @@ out:
return error;
}
+static inline int
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
+{
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rh->h_size);
+
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
+ h_size > XLOG_HEADER_CYCLE_SIZE)
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ }
+ return 1;
+}
+
/*
* Potentially backup over partial log record write.
*
@@ -463,15 +476,7 @@ xlog_find_verify_log_record(
* reset last_blk. Only when last_blk points in the middle of a log
* record do we update last_blk.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- uint h_size = be32_to_cpu(head->h_size);
-
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- } else {
- xhdrs = 1;
- }
+ xhdrs = xlog_logrec_hblks(log, head);
if (*last_blk - i + extra_bblks !=
BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
@@ -1158,22 +1163,7 @@ xlog_check_unmount_rec(
* below. We won't want to clear the unmount record if there is one, so
* we pass the lsn of the unmount record rather than the block after it.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
-
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
-
+ hblks = xlog_logrec_hblks(log, rhead);
after_umount_blk = xlog_wrap_logbno(log,
rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
@@ -2444,44 +2434,66 @@ xlog_recover_process_data(
/* Take all the collected deferred ops and finish them in order. */
static int
xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- int64_t freeblks;
- uint resblks;
- int error;
+ struct xfs_inode *ip;
+ int error = 0;
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
- return xfs_trans_commit(tp);
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &ip);
+
+ error = xfs_trans_commit(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ }
+ if (error)
+ return error;
+ }
+
+ ASSERT(list_empty(capture_list));
+ return 0;
}
-/* Is this log item a deferred action intent? */
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
-}
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_release(mp, dfc);
+ }
+}
/*
* When this is called, all of the log intent items which did not have
* corresponding log done items should be in the AIL. What we do now
@@ -2502,35 +2514,23 @@ STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(capture_list);
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
- int error;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- while (lip != NULL) {
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
/*
* We're done when we see something other than an intent.
* There should be no intents left in the AIL now.
@@ -2552,26 +2552,29 @@ xlog_recover_process_intents(
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the transaction in
- * this routine or else those subsequent intents will get
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
* replayed in the wrong order!
*/
- if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) {
- spin_unlock(&ailp->ail_lock);
- error = lip->li_ops->iop_recover(lip, parent_tp);
- spin_lock(&ailp->ail_lock);
- }
+ spin_unlock(&ailp->ail_lock);
+ error = lip->li_ops->iop_recover(lip, &capture_list);
+ spin_lock(&ailp->ail_lock);
if (error)
- goto out;
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ break;
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ if (error)
+ goto err;
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
return error;
}
@@ -2878,7 +2881,8 @@ STATIC int
xlog_valid_rec_header(
struct xlog *log,
struct xlog_rec_header *rhead,
- xfs_daddr_t blkno)
+ xfs_daddr_t blkno,
+ int bufsize)
{
int hlen;
@@ -2894,10 +2898,14 @@ xlog_valid_rec_header(
return -EFSCORRUPTED;
}
- /* LR body must have data or it wouldn't have been written */
+ /*
+ * LR body must have data (or it wouldn't have been written)
+ * and h_len must not be greater than LR buffer size.
+ */
hlen = be32_to_cpu(rhead->h_len);
- if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
return -EFSCORRUPTED;
+
if (XFS_IS_CORRUPT(log->l_mp,
blkno > log->l_logBBsize || blkno > INT_MAX))
return -EFSCORRUPTED;
@@ -2958,9 +2966,6 @@ xlog_do_recovery_pass(
goto bread_err1;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, tail_blk);
- if (error)
- goto bread_err1;
/*
* xfsprogs has a bug where record length is based on lsunit but
@@ -2975,30 +2980,22 @@ xlog_do_recovery_pass(
*/
h_size = be32_to_cpu(rhead->h_size);
h_len = be32_to_cpu(rhead->h_len);
- if (h_len > h_size) {
- if (h_len <= log->l_mp->m_logbsize &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- xfs_warn(log->l_mp,
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
+ rhead->h_num_logops == cpu_to_be32(1)) {
+ xfs_warn(log->l_mp,
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
- h_size, log->l_mp->m_logbsize);
- h_size = log->l_mp->m_logbsize;
- } else {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
- log->l_mp);
- error = -EFSCORRUPTED;
- goto bread_err1;
- }
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
}
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
+ if (error)
+ goto bread_err1;
+
+ hblks = xlog_logrec_hblks(log, rhead);
+ if (hblks != 1) {
kmem_free(hbp);
hbp = xlog_alloc_buffer(log, hblks);
- } else {
- hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
@@ -3070,7 +3067,7 @@ xlog_do_recovery_pass(
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
- split_hblks ? blk_no : 0);
+ split_hblks ? blk_no : 0, h_size);
if (error)
goto bread_err2;
@@ -3151,7 +3148,7 @@ xlog_do_recovery_pass(
goto bread_err2;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
if (error)
goto bread_err2;
@@ -3449,6 +3446,14 @@ xlog_recover_finish(
int error;
error = xlog_recover_process_intents(log);
if (error) {
+ /*
+ * Cancel all the unprocessed intent items now so that
+ * we don't leave them pinned in the AIL. This can
+ * cause the AIL to livelock on the pinned item if
+ * anyone tries to push the AIL (inode reclaim does
+ * this) before we get around to xfs_log_mount_cancel.
+ */
+ xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
return error;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3f82e0c92c2d..b2a9abee8b2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -249,7 +249,6 @@ xfs_qm_unmount_quotas(
STATIC int
xfs_qm_dqattach_one(
struct xfs_inode *ip,
- xfs_dqid_t id,
xfs_dqtype_t type,
bool doalloc,
struct xfs_dquot **IO_idqpp)
@@ -330,23 +329,23 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
- XFS_DQTYPE_USER, doalloc, &ip->i_udquot);
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
+ doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
- XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot);
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
+ doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -1663,6 +1662,7 @@ xfs_qm_vop_dqalloc(
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(O_udqpp);
if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
@@ -1696,6 +1696,7 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(O_gdqpp);
if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
@@ -1713,9 +1714,10 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ ASSERT(O_pdqpp);
if (ip->i_d.di_projid != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, prid,
XFS_DQTYPE_PROJ, true, &pq);
if (error) {
ASSERT(error != -ENOENT);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index ca93b6488377..7529eb63ce94 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
STATIC int
xfs_cui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_bmbt_irec irec;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
@@ -432,7 +432,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
@@ -467,14 +467,8 @@ xfs_cui_item_recover(
refc->pe_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
refc->pe_len >= mp->m_sb.sb_agblocks ||
- (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
- /*
- * This will pull the CUI from the AIL and
- * free the memory associated with it.
- */
- xfs_cui_release(cuip);
+ (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
}
/*
@@ -493,12 +487,7 @@ xfs_cui_item_recover(
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -555,13 +544,10 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
@@ -574,6 +560,32 @@ xfs_cui_item_match(
return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
@@ -581,6 +593,7 @@ static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_release = xfs_cui_item_release,
.iop_recover = xfs_cui_item_recover,
.iop_match = xfs_cui_item_match,
+ .iop_relog = xfs_cui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index dc5b0753cd51..7adc996ca6e3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -467,14 +467,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
STATIC int
xfs_rui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct list_head *capture_list)
{
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
@@ -511,14 +511,8 @@ xfs_rui_item_recover(
rmap->me_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
rmap->me_len >= mp->m_sb.sb_agblocks ||
- (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the RUI from the AIL and
- * free the memory associated with it.
- */
- xfs_rui_release(ruip);
+ (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS))
return -EFSCORRUPTED;
- }
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
@@ -573,8 +567,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
@@ -590,6 +583,32 @@ xfs_rui_item_match(
return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
}
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
@@ -597,6 +616,7 @@ static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_release = xfs_rui_item_release,
.iop_recover = xfs_rui_item_recover,
.iop_match = xfs_rui_item_match,
+ .iop_relog = xfs_rui_item_relog,
};
/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5b89c12f1566..ede1baf31413 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -18,7 +18,7 @@
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_sb.h"
/*
* Read and return the summary information for a given extent size,
@@ -778,8 +778,14 @@ xfs_growfs_rt_alloc(
struct xfs_bmbt_irec map; /* block map output */
int nmap; /* number of block maps */
int resblks; /* space reservation */
+ enum xfs_blft buf_type;
struct xfs_trans *tp;
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
/*
* Allocate space to the file, as necessary.
*/
@@ -841,6 +847,9 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0, &bp);
if (error)
goto out_trans_cancel;
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
@@ -1015,23 +1024,29 @@ xfs_growfs_rt(
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
- * Update the bitmap inode's size.
+ * Update the bitmap inode's size ondisk and incore. We need
+ * to update the incore size so that inode inactivation won't
+ * punch what it thinks are "posteof" blocks.
*/
mp->m_rbmip->i_d.di_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+ i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
/*
* Get the summary inode into the transaction.
*/
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
- * Update the summary inode's size.
+ * Update the summary inode's size. We need to update the
+ * incore size so that inode inactivation won't punch what it
+ * thinks are "posteof" blocks.
*/
mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+ i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size);
xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
/*
* Copy summary data from old to new sizes.
@@ -1087,7 +1102,13 @@ error_cancel:
if (error)
break;
}
+ if (error)
+ goto out_free;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+out_free:
/*
* Free the fake mp structure.
*/
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f70f1255220b..20e0534a772c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
uint64_t xs_xstrat_bytes = 0;
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
static const struct xstats_entry {
char *desc;
@@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 34d704f703d2..43ffba74f045 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -137,6 +137,7 @@ struct __xfsstats {
uint64_t xs_xstrat_bytes;
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
+ uint64_t defer_relog;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index baf5de30eebb..e3e229e52512 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -794,8 +794,7 @@ xfs_fs_statfs(
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
- statp->f_fsid.val[0] = (u32)id;
- statp->f_fsid.val[1] = (u32)(id >> 32);
+ statp->f_fsid = u64_to_fsid(id);
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
@@ -1234,25 +1233,12 @@ xfs_fc_parse_param(
case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
return 0;
- case Opt_ikeep:
- mp->m_flags |= XFS_MOUNT_IKEEP;
- return 0;
- case Opt_noikeep:
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
- return 0;
case Opt_largeio:
mp->m_flags |= XFS_MOUNT_LARGEIO;
return 0;
case Opt_nolargeio:
mp->m_flags &= ~XFS_MOUNT_LARGEIO;
return 0;
- case Opt_attr2:
- mp->m_flags |= XFS_MOUNT_ATTR2;
- return 0;
- case Opt_noattr2:
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
- return 0;
case Opt_filestreams:
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
return 0;
@@ -1304,6 +1290,24 @@ xfs_fc_parse_param(
xfs_mount_set_dax_mode(mp, result.uint_32);
return 0;
#endif
+ /* Following mount options will be removed in September 2025 */
+ case Opt_ikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_attr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+ return 0;
default:
xfs_warn(mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1450,6 +1454,19 @@ xfs_fc_fill_super(
if (error)
goto out_free_sb;
+ /* V4 support is undergoing deprecation. */
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+#ifdef CONFIG_XFS_SUPPORT_V4
+ xfs_warn_once(mp,
+ "Deprecated V4 format (crc=0) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
/*
* XFS block mappings use 54 bits to store the logical block offset.
* This should suffice to handle the maximum file size that the VFS
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 021ef96d0542..fac9de7ee6d0 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,13 +50,45 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
+STATIC int
+xfs_deprecate_irix_sgid_inherit_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_once(KERN_WARNING
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
+STATIC int
+xfs_deprecate_irix_symlink_mode_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_once(KERN_WARNING
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -65,7 +97,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index dcdcf99cfa5d..86951652d3ed 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2533,6 +2533,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ca18a040336a..c94e71f741b6 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -959,7 +959,7 @@ xfs_trans_cancel(
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
- ASSERT(!(lip->li_type == XFS_LI_EFD));
+ ASSERT(!xlog_item_is_intent_done(lip));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f46534b75236..084658946cc8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,14 +55,12 @@ struct xfs_log_item {
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
-#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */
#define XFS_LI_FLAGS \
{ (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
{ (1 << XFS_LI_ABORTED), "ABORTED" }, \
{ (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }, \
- { (1 << XFS_LI_RECOVERED), "RECOVERED" }
+ { (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
unsigned flags;
@@ -74,10 +72,29 @@ struct xfs_item_ops {
void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
- int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
};
+/* Is this log item a deferred action intent? */
+static inline bool
+xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_recover != NULL &&
+ lip->li_ops->iop_match != NULL;
+}
+
+/* Is this a log intent-done item? */
+static inline bool
+xlog_item_is_intent_done(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_unpin == NULL &&
+ lip->li_ops->iop_push == NULL;
+}
+
/*
* Release the log item as soon as committed. This is for items just logging
* intents that never need to be written back in place.
@@ -243,4 +260,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 133fc6fc3edd..fe45b0c3970c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -221,36 +221,27 @@ xfs_trans_mod_dquot(
}
switch (field) {
-
- /*
- * regular disk blk reservation
- */
- case XFS_TRANS_DQ_RES_BLKS:
+ /* regular disk blk reservation */
+ case XFS_TRANS_DQ_RES_BLKS:
qtrx->qt_blk_res += delta;
break;
- /*
- * inode reservation
- */
- case XFS_TRANS_DQ_RES_INOS:
+ /* inode reservation */
+ case XFS_TRANS_DQ_RES_INOS:
qtrx->qt_ino_res += delta;
break;
- /*
- * disk blocks used.
- */
- case XFS_TRANS_DQ_BCOUNT:
+ /* disk blocks used. */
+ case XFS_TRANS_DQ_BCOUNT:
qtrx->qt_bcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
qtrx->qt_delbcnt_delta += delta;
break;
- /*
- * Inode Count
- */
- case XFS_TRANS_DQ_ICOUNT:
+ /* Inode Count */
+ case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
@@ -258,17 +249,13 @@ xfs_trans_mod_dquot(
qtrx->qt_icount_delta += delta;
break;
- /*
- * rtblk reservation
- */
- case XFS_TRANS_DQ_RES_RTBLKS:
+ /* rtblk reservation */
+ case XFS_TRANS_DQ_RES_RTBLKS:
qtrx->qt_rtblk_res += delta;
break;
- /*
- * rtblk count
- */
- case XFS_TRANS_DQ_RTBCOUNT:
+ /* rtblk count */
+ case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
@@ -276,11 +263,11 @@ xfs_trans_mod_dquot(
qtrx->qt_rtbcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELRTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
qtrx->qt_delrtb_delta += delta;
break;
- default:
+ default:
ASSERT(0);
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8ec7c8f109d7..ff5930be096c 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,39 @@
#include "zonefs.h"
+static inline int zonefs_zone_mgmt(struct inode *inode,
+ enum req_opf op)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret;
+
+ lockdep_assert_held(&zi->i_truncate_mutex);
+
+ ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
+ zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
+ if (ret) {
+ zonefs_err(inode->i_sb,
+ "Zone management operation %s at %llu failed %d\n",
+ blk_op_str(op), zi->i_zsector, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ i_size_write(inode, isize);
+ /*
+ * A full zone is no longer open/active and does not need
+ * explicit closing.
+ */
+ if (isize >= zi->i_max_size)
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+}
+
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
@@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
}
/*
+ * If the filesystem is mounted with the explicit-open mount option, we
+ * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
+ * the read-only or offline condition, to avoid attempting an explicit
+ * close of the zone when the inode file is closed.
+ */
+ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
+ (zone->cond == BLK_ZONE_COND_OFFLINE ||
+ zone->cond == BLK_ZONE_COND_READONLY))
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+
+ /*
* If error=remount-ro was specified, any error result in remounting
* the volume as read-only.
*/
@@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* invalid data.
*/
zonefs_update_stats(inode, data_size);
- i_size_write(inode, data_size);
+ zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
return 0;
@@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* eventually correct the file size and zonefs inode write pointer offset
* (which can be out of sync with the drive due to partial write failures).
*/
-static void zonefs_io_error(struct inode *inode, bool write)
+static void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write)
};
int ret;
- mutex_lock(&zi->i_truncate_mutex);
-
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
* reclaim which may in turn cause a recursion into zonefs as well as
@@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write)
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
memalloc_noio_restore(noio_flag);
+}
+static void zonefs_io_error(struct inode *inode, bool write)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ mutex_lock(&zi->i_truncate_mutex);
+ __zonefs_io_error(inode, write);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
if (isize == old_isize)
goto unlock;
- ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
- zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
- if (ret) {
- zonefs_err(inode->i_sb,
- "Zone management operation at %llu failed %d",
- zi->i_zsector, ret);
+ ret = zonefs_zone_mgmt(inode, op);
+ if (ret)
goto unlock;
+
+ /*
+ * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+ * take care of open zones.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+ /*
+ * Truncating a zone to EMPTY or FULL is the equivalent of
+ * closing the zone. For a truncation to 0, we need to
+ * re-open the zone to ensure new writes can be processed.
+ * For a truncation to the maximum file size, the zone is
+ * closed and writes cannot be accepted anymore, so clear
+ * the open flag.
+ */
+ if (!isize)
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ else
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
}
zonefs_update_stats(inode, isize);
@@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
mutex_lock(&zi->i_truncate_mutex);
if (i_size_read(inode) < iocb->ki_pos + size) {
zonefs_update_stats(inode, iocb->ki_pos + size);
- i_size_write(inode, iocb->ki_pos + size);
+ zonefs_i_size_write(inode, iocb->ki_pos + size);
}
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -865,8 +928,128 @@ inode_unlock:
return ret;
}
+static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+ if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
+ return false;
+
+ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ return false;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return false;
+
+ return true;
+}
+
+static int zonefs_open_zone(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ zi->i_wr_refcnt++;
+ if (zi->i_wr_refcnt == 1) {
+
+ if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
+ atomic_dec(&sbi->s_open_zones);
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (i_size_read(inode) < zi->i_max_size) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ if (ret) {
+ zi->i_wr_refcnt--;
+ atomic_dec(&sbi->s_open_zones);
+ goto unlock;
+ }
+ zi->i_flags |= ZONEFS_ZONE_OPEN;
+ }
+ }
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return ret;
+}
+
+static int zonefs_file_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+
+ if (zonefs_file_use_exp_open(inode, file))
+ return zonefs_open_zone(inode);
+
+ return 0;
+}
+
+static void zonefs_close_zone(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+ zi->i_wr_refcnt--;
+ if (!zi->i_wr_refcnt) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * If the file zone is full, it is not open anymore and we only
+ * need to decrement the open count.
+ */
+ if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
+ goto dec;
+
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ if (ret) {
+ __zonefs_io_error(inode, false);
+ /*
+ * Leaving zones explicitly open may lead to a state
+ * where most zones cannot be written (zone resources
+ * exhausted). So take preventive action by remounting
+ * read-only.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN &&
+ !(sb->s_flags & SB_RDONLY)) {
+ zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ }
+ }
+ zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+dec:
+ atomic_dec(&sbi->s_open_zones);
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+}
+
+static int zonefs_file_release(struct inode *inode, struct file *file)
+{
+ /*
+ * If we explicitly open a zone we must close it again as well, but the
+ * zone management operation can fail (either due to an IO error or as
+ * the zone has gone offline or read-only). Make sure we don't fail the
+ * close(2) for user-space.
+ */
+ if (zonefs_file_use_exp_open(inode, file))
+ zonefs_close_zone(inode);
+
+ return 0;
+}
+
static const struct file_operations zonefs_file_operations = {
- .open = generic_file_open,
+ .open = zonefs_file_open,
+ .release = zonefs_file_release,
.fsync = zonefs_file_fsync,
.mmap = zonefs_file_mmap,
.llseek = zonefs_file_llseek,
@@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
init_rwsem(&zi->i_mmap_sem);
+ zi->i_wr_refcnt = 0;
return &zi->i_vnode;
}
@@ -932,15 +1116,14 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^
le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = (u32)fsid;
- buf->f_fsid.val[1] = (u32)(fsid >> 32);
+ buf->f_fsid = u64_to_fsid(fsid);
return 0;
}
enum {
Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
- Opt_err,
+ Opt_explicit_open, Opt_err,
};
static const match_table_t tokens = {
@@ -948,6 +1131,7 @@ static const match_table_t tokens = {
{ Opt_errors_zro, "errors=zone-ro"},
{ Opt_errors_zol, "errors=zone-offline"},
{ Opt_errors_repair, "errors=repair"},
+ { Opt_explicit_open, "explicit-open" },
{ Opt_err, NULL}
};
@@ -984,6 +1168,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options)
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
break;
+ case Opt_explicit_open:
+ sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ break;
default:
return -EINVAL;
}
@@ -1403,6 +1590,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+ sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
+ atomic_set(&sbi->s_open_zones, 0);
+ if (!sbi->s_max_open_zones &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
ret = zonefs_read_super(sb);
if (ret)
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 55b39970acb2..51141907097c 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
return ZONEFS_ZTYPE_SEQ;
}
+#define ZONEFS_ZONE_OPEN (1 << 0)
+
/*
* In-memory inode data.
*/
@@ -74,6 +76,10 @@ struct zonefs_inode_info {
*/
struct mutex i_truncate_mutex;
struct rw_semaphore i_mmap_sem;
+
+ /* guarded by i_truncate_mutex */
+ unsigned int i_wr_refcnt;
+ unsigned int i_flags;
};
static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
@@ -154,6 +160,7 @@ enum zonefs_features {
#define ZONEFS_MNTOPT_ERRORS_MASK \
(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
+#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */
/*
* In-memory Super block information.
@@ -175,6 +182,9 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
+
+ unsigned int s_max_open_zones;
+ atomic_t s_open_zones;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)