Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 10
-rw-r--r--  fs/9p/v9fs.c | 4
-rw-r--r--  fs/9p/v9fs.h | 13
-rw-r--r--  fs/9p/vfs_addr.c | 99
-rw-r--r--  fs/9p/vfs_inode.c | 15
-rw-r--r--  fs/Kconfig | 4
-rw-r--r--  fs/Kconfig.binfmt | 13
-rw-r--r--  fs/adfs/inode.c | 3
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/file.c | 6
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/afs/dir.c | 18
-rw-r--r--  fs/afs/dynroot.c | 1
-rw-r--r--  fs/afs/file.c | 69
-rw-r--r--  fs/afs/inode.c | 32
-rw-r--r--  fs/afs/internal.h | 29
-rw-r--r--  fs/afs/super.c | 6
-rw-r--r--  fs/afs/write.c | 23
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/befs/linuxvfs.c | 2
-rw-r--r--  fs/bfs/file.c | 3
-rw-r--r--  fs/bfs/inode.c | 2
-rw-r--r--  fs/binfmt_elf.c | 157
-rw-r--r--  fs/binfmt_elf_fdpic.c | 20
-rw-r--r--  fs/binfmt_elf_test.c | 64
-rw-r--r--  fs/binfmt_flat.c | 7
-rw-r--r--  fs/btrfs/Makefile | 1
-rw-r--r--  fs/btrfs/check-integrity.c | 1
-rw-r--r--  fs/btrfs/ctree.h | 3
-rw-r--r--  fs/btrfs/disk-io.c | 57
-rw-r--r--  fs/btrfs/extent-io-tree.h | 4
-rw-r--r--  fs/btrfs/extent_io.c | 42
-rw-r--r--  fs/btrfs/inode.c | 88
-rw-r--r--  fs/btrfs/reflink.c | 4
-rw-r--r--  fs/buffer.c | 134
-rw-r--r--  fs/cachefiles/io.c | 12
-rw-r--r--  fs/cachefiles/namei.c | 33
-rw-r--r--  fs/cachefiles/xattr.c | 2
-rw-r--r--  fs/ceph/addr.c | 449
-rw-r--r--  fs/ceph/cache.c | 28
-rw-r--r--  fs/ceph/cache.h | 28
-rw-r--r--  fs/ceph/caps.c | 23
-rw-r--r--  fs/ceph/debugfs.c | 5
-rw-r--r--  fs/ceph/dir.c | 17
-rw-r--r--  fs/ceph/file.c | 85
-rw-r--r--  fs/ceph/inode.c | 73
-rw-r--r--  fs/ceph/locks.c | 8
-rw-r--r--  fs/ceph/mds_client.c | 75
-rw-r--r--  fs/ceph/mds_client.h | 15
-rw-r--r--  fs/ceph/metric.c | 63
-rw-r--r--  fs/ceph/metric.h | 63
-rw-r--r--  fs/ceph/snap.c | 263
-rw-r--r--  fs/ceph/strings.c | 1
-rw-r--r--  fs/ceph/super.c | 8
-rw-r--r--  fs/ceph/super.h | 27
-rw-r--r--  fs/ceph/xattr.c | 13
-rw-r--r--  fs/cifs/cifs_debug.c | 2
-rw-r--r--  fs/cifs/cifs_swn.c | 6
-rw-r--r--  fs/cifs/cifsfs.c | 40
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 34
-rw-r--r--  fs/cifs/cifspdu.h | 14
-rw-r--r--  fs/cifs/cifsproto.h | 3
-rw-r--r--  fs/cifs/cifssmb.c | 13
-rw-r--r--  fs/cifs/connect.c | 119
-rw-r--r--  fs/cifs/dfs_cache.c | 21
-rw-r--r--  fs/cifs/file.c | 57
-rw-r--r--  fs/cifs/fscache.c | 19
-rw-r--r--  fs/cifs/fscache.h | 2
-rw-r--r--  fs/cifs/inode.c | 2
-rw-r--r--  fs/cifs/link.c | 3
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/cifs/netmisc.c | 2
-rw-r--r--  fs/cifs/ntlmssp.h | 2
-rw-r--r--  fs/cifs/smb1ops.c | 2
-rw-r--r--  fs/cifs/smb2glob.h | 11
-rw-r--r--  fs/cifs/smb2misc.c | 24
-rw-r--r--  fs/cifs/smb2ops.c | 281
-rw-r--r--  fs/cifs/smb2pdu.c | 73
-rw-r--r--  fs/cifs/smb2pdu.h | 560
-rw-r--r--  fs/cifs/smb2proto.h | 2
-rw-r--r--  fs/cifs/trace.h | 7
-rw-r--r--  fs/cifs/transport.c | 9
-rw-r--r--  fs/coda/file.c | 1
-rw-r--r--  fs/coda/inode.c | 2
-rw-r--r--  fs/compat_binfmt_elf.c | 2
-rw-r--r--  fs/coredump.c | 87
-rw-r--r--  fs/crypto/bio.c | 13
-rw-r--r--  fs/crypto/crypto.c | 10
-rw-r--r--  fs/crypto/inline_crypt.c | 93
-rw-r--r--  fs/dax.c | 3
-rw-r--r--  fs/dcache.c | 3
-rw-r--r--  fs/debugfs/inode.c | 10
-rw-r--r--  fs/direct-io.c | 8
-rw-r--r--  fs/ecryptfs/mmap.c | 5
-rw-r--r--  fs/ecryptfs/super.c | 2
-rw-r--r--  fs/efs/super.c | 2
-rw-r--r--  fs/erofs/data.c | 12
-rw-r--r--  fs/erofs/dir.c | 21
-rw-r--r--  fs/erofs/erofs_fs.h | 5
-rw-r--r--  fs/erofs/inode.c | 4
-rw-r--r--  fs/erofs/internal.h | 2
-rw-r--r--  fs/erofs/namei.c | 54
-rw-r--r--  fs/erofs/super.c | 40
-rw-r--r--  fs/erofs/sysfs.c | 8
-rw-r--r--  fs/erofs/zdata.c | 201
-rw-r--r--  fs/erofs/zdata.h | 2
-rw-r--r--  fs/erofs/zmap.c | 71
-rw-r--r--  fs/exec.c | 39
-rw-r--r--  fs/exfat/exfat_fs.h | 3
-rw-r--r--  fs/exfat/file.c | 2
-rw-r--r--  fs/exfat/inode.c | 3
-rw-r--r--  fs/exfat/namei.c | 55
-rw-r--r--  fs/exfat/super.c | 19
-rw-r--r--  fs/ext2/ialloc.c | 5
-rw-r--r--  fs/ext2/inode.c | 9
-rw-r--r--  fs/ext2/super.c | 8
-rw-r--r--  fs/ext4/balloc.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 26
-rw-r--r--  fs/ext4/ext4.h | 14
-rw-r--r--  fs/ext4/extents.c | 33
-rw-r--r--  fs/ext4/fast_commit.c | 173
-rw-r--r--  fs/ext4/fast_commit.h | 6
-rw-r--r--  fs/ext4/file.c | 12
-rw-r--r--  fs/ext4/inline.c | 9
-rw-r--r--  fs/ext4/inode.c | 179
-rw-r--r--  fs/ext4/ioctl.c | 22
-rw-r--r--  fs/ext4/mballoc.c | 345
-rw-r--r--  fs/ext4/namei.c | 29
-rw-r--r--  fs/ext4/page-io.c | 22
-rw-r--r--  fs/ext4/readpage.c | 10
-rw-r--r--  fs/ext4/resize.c | 7
-rw-r--r--  fs/ext4/super.c | 141
-rw-r--r--  fs/f2fs/Kconfig | 7
-rw-r--r--  fs/f2fs/acl.c | 21
-rw-r--r--  fs/f2fs/checkpoint.c | 89
-rw-r--r--  fs/f2fs/compress.c | 17
-rw-r--r--  fs/f2fs/data.c | 251
-rw-r--r--  fs/f2fs/debug.c | 25
-rw-r--r--  fs/f2fs/dir.c | 12
-rw-r--r--  fs/f2fs/f2fs.h | 182
-rw-r--r--  fs/f2fs/file.c | 183
-rw-r--r--  fs/f2fs/gc.c | 53
-rw-r--r--  fs/f2fs/inline.c | 4
-rw-r--r--  fs/f2fs/inode.c | 10
-rw-r--r--  fs/f2fs/namei.c | 78
-rw-r--r--  fs/f2fs/node.c | 121
-rw-r--r--  fs/f2fs/node.h | 3
-rw-r--r--  fs/f2fs/recovery.c | 35
-rw-r--r--  fs/f2fs/segment.c | 176
-rw-r--r--  fs/f2fs/segment.h | 5
-rw-r--r--  fs/f2fs/super.c | 137
-rw-r--r--  fs/f2fs/sysfs.c | 40
-rw-r--r--  fs/f2fs/verity.c | 4
-rw-r--r--  fs/f2fs/xattr.c | 12
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/inode.c | 5
-rw-r--r--  fs/fcntl.c | 18
-rw-r--r--  fs/file.c | 31
-rw-r--r--  fs/file_table.c | 1
-rw-r--r--  fs/freevxfs/vxfs_super.c | 2
-rw-r--r--  fs/fs-writeback.c | 46
-rw-r--r--  fs/fscache/Kconfig | 3
-rw-r--r--  fs/fscache/cache.c | 2
-rw-r--r--  fs/fscache/cookie.c | 4
-rw-r--r--  fs/fscache/internal.h | 15
-rw-r--r--  fs/fscache/io.c | 33
-rw-r--r--  fs/fuse/control.c | 17
-rw-r--r--  fs/fuse/dax.c | 3
-rw-r--r--  fs/fuse/dev.c | 8
-rw-r--r--  fs/fuse/dir.c | 2
-rw-r--r--  fs/fuse/file.c | 33
-rw-r--r--  fs/fuse/fuse_i.h | 2
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/fuse/ioctl.c | 2
-rw-r--r--  fs/fuse/virtio_fs.c | 1
-rw-r--r--  fs/gfs2/aops.c | 43
-rw-r--r--  fs/gfs2/bmap.c | 9
-rw-r--r--  fs/gfs2/file.c | 66
-rw-r--r--  fs/gfs2/glock.c | 14
-rw-r--r--  fs/gfs2/inode.c | 51
-rw-r--r--  fs/gfs2/lock_dlm.c | 15
-rw-r--r--  fs/gfs2/lops.c | 9
-rw-r--r--  fs/gfs2/meta_io.c | 10
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/gfs2/rgrp.c | 19
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 4
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hfs/inode.c | 6
-rw-r--r--  fs/hfs/mdb.c | 2
-rw-r--r--  fs/hfs/super.c | 2
-rw-r--r--  fs/hfsplus/inode.c | 6
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hfsplus/wrapper.c | 5
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hpfs/file.c | 3
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 13
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/internal.h | 6
-rw-r--r--  fs/io-wq.c | 120
-rw-r--r--  fs/io_uring.c | 1591
-rw-r--r--  fs/ioctl.c | 2
-rw-r--r--  fs/iomap/buffered-io.c | 79
-rw-r--r--  fs/iomap/direct-io.c | 15
-rw-r--r--  fs/iomap/fiemap.c | 1
-rw-r--r--  fs/iomap/trace.h | 2
-rw-r--r--  fs/isofs/inode.c | 2
-rw-r--r--  fs/jbd2/commit.c | 4
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 101
-rw-r--r--  fs/jffs2/build.c | 4
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/jffs2_fs_i.h | 4
-rw-r--r--  fs/jffs2/scan.c | 6
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jfs/inode.c | 6
-rw-r--r--  fs/jfs/jfs_dmap.c | 7
-rw-r--r--  fs/jfs/jfs_logmgr.c | 11
-rw-r--r--  fs/jfs/jfs_metapage.c | 23
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/kernfs/dir.c | 16
-rw-r--r--  fs/kernfs/file.c | 9
-rw-r--r--  fs/kernfs/kernfs-internal.h | 19
-rw-r--r--  fs/ksmbd/ksmbd_netlink.h | 2
-rw-r--r--  fs/ksmbd/misc.c | 40
-rw-r--r--  fs/ksmbd/misc.h | 3
-rw-r--r--  fs/ksmbd/ntlmssp.h | 6
-rw-r--r--  fs/ksmbd/oplock.c | 34
-rw-r--r--  fs/ksmbd/oplock.h | 2
-rw-r--r--  fs/ksmbd/server.c | 2
-rw-r--r--  fs/ksmbd/smb2pdu.c | 165
-rw-r--r--  fs/ksmbd/smb2pdu.h | 537
-rw-r--r--  fs/ksmbd/transport_rdma.c | 2
-rw-r--r--  fs/ksmbd/transport_tcp.c | 4
-rw-r--r--  fs/ksmbd/vfs.c | 7
-rw-r--r--  fs/ksmbd/vfs_cache.c | 2
-rw-r--r--  fs/ksmbd/vfs_cache.h | 1
-rw-r--r--  fs/ksmbd/xattr.h | 2
-rw-r--r--  fs/libfs.c | 15
-rw-r--r--  fs/lockd/svc.c | 24
-rw-r--r--  fs/minix/inode.c | 8
-rw-r--r--  fs/mpage.c | 89
-rw-r--r--  fs/namei.c | 22
-rw-r--r--  fs/namespace.c | 205
-rw-r--r--  fs/netfs/Makefile | 8
-rw-r--r--  fs/netfs/buffered_read.c | 428
-rw-r--r--  fs/netfs/internal.h | 50
-rw-r--r--  fs/netfs/io.c | 657
-rw-r--r--  fs/netfs/main.c | 20
-rw-r--r--  fs/netfs/objects.c | 160
-rw-r--r--  fs/netfs/read_helper.c | 1205
-rw-r--r--  fs/netfs/stats.c | 1
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 26
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 1
-rw-r--r--  fs/nfs/callback.c | 66
-rw-r--r--  fs/nfs/callback_proc.c | 29
-rw-r--r--  fs/nfs/callback_xdr.c | 4
-rw-r--r--  fs/nfs/client.c | 3
-rw-r--r--  fs/nfs/delegation.c | 2
-rw-r--r--  fs/nfs/dir.c | 633
-rw-r--r--  fs/nfs/direct.c | 48
-rw-r--r--  fs/nfs/file.c | 60
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 2
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 53
-rw-r--r--  fs/nfs/fs_context.c | 8
-rw-r--r--  fs/nfs/fscache.c | 61
-rw-r--r--  fs/nfs/fscache.h | 45
-rw-r--r--  fs/nfs/inode.c | 89
-rw-r--r--  fs/nfs/internal.h | 35
-rw-r--r--  fs/nfs/nfs2xdr.c | 3
-rw-r--r--  fs/nfs/nfs3xdr.c | 30
-rw-r--r--  fs/nfs/nfs42proc.c | 34
-rw-r--r--  fs/nfs/nfs42xattr.c | 9
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4file.c | 14
-rw-r--r--  fs/nfs/nfs4proc.c | 64
-rw-r--r--  fs/nfs/nfs4state.c | 60
-rw-r--r--  fs/nfs/nfs4xdr.c | 7
-rw-r--r--  fs/nfs/nfstrace.h | 221
-rw-r--r--  fs/nfs/pagelist.c | 11
-rw-r--r--  fs/nfs/pnfs.c | 50
-rw-r--r--  fs/nfs/pnfs.h | 2
-rw-r--r--  fs/nfs/pnfs_nfs.c | 8
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/read.c | 29
-rw-r--r--  fs/nfs/unlink.c | 1
-rw-r--r--  fs/nfs/write.c | 65
-rw-r--r--  fs/nfsd/Kconfig | 12
-rw-r--r--  fs/nfsd/Makefile | 3
-rw-r--r--  fs/nfsd/blocklayout.c | 1
-rw-r--r--  fs/nfsd/filecache.c | 32
-rw-r--r--  fs/nfsd/flexfilelayout.c | 2
-rw-r--r--  fs/nfsd/nfs2acl.c | 24
-rw-r--r--  fs/nfsd/nfs4layouts.c | 2
-rw-r--r--  fs/nfsd/nfs4state.c | 20
-rw-r--r--  fs/nfsd/nfs4xdr.c | 10
-rw-r--r--  fs/nfsd/nfscache.c | 33
-rw-r--r--  fs/nfsd/nfsctl.c | 10
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/nfsfh.c | 4
-rw-r--r--  fs/nfsd/nfsfh.h | 20
-rw-r--r--  fs/nfsd/nfsproc.c | 2
-rw-r--r--  fs/nfsd/nfssvc.c | 25
-rw-r--r--  fs/nfsd/trace.h | 107
-rw-r--r--  fs/nfsd/vfs.c | 10
-rw-r--r--  fs/nfsd/vfs.h | 2
-rw-r--r--  fs/nfsd/xdr.h | 2
-rw-r--r--  fs/nilfs2/btnode.c | 23
-rw-r--r--  fs/nilfs2/btnode.h | 1
-rw-r--r--  fs/nilfs2/btree.c | 27
-rw-r--r--  fs/nilfs2/dat.c | 4
-rw-r--r--  fs/nilfs2/gcinode.c | 7
-rw-r--r--  fs/nilfs2/inode.c | 199
-rw-r--r--  fs/nilfs2/mdt.c | 46
-rw-r--r--  fs/nilfs2/mdt.h | 6
-rw-r--r--  fs/nilfs2/nilfs.h | 16
-rw-r--r--  fs/nilfs2/page.c | 16
-rw-r--r--  fs/nilfs2/page.h | 1
-rw-r--r--  fs/nilfs2/segbuf.c | 63
-rw-r--r--  fs/nilfs2/segment.c | 9
-rw-r--r--  fs/nilfs2/super.c | 7
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 47
-rw-r--r--  fs/notify/fsnotify.c | 14
-rw-r--r--  fs/notify/mark.c | 4
-rw-r--r--  fs/ntfs/aops.c | 21
-rw-r--r--  fs/ntfs/inode.c | 6
-rw-r--r--  fs/ntfs3/fsntfs.c | 36
-rw-r--r--  fs/ntfs3/inode.c | 2
-rw-r--r--  fs/ntfs3/super.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 6
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 4
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 2
-rw-r--r--  fs/ocfs2/dir.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 2
-rw-r--r--  fs/ocfs2/file.c | 13
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/localalloc.c | 6
-rw-r--r--  fs/ocfs2/namei.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 4
-rw-r--r--  fs/ocfs2/quota_global.c | 25
-rw-r--r--  fs/ocfs2/quota_local.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 18
-rw-r--r--  fs/ocfs2/super.c | 24
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/omfs/file.c | 3
-rw-r--r--  fs/open.c | 1
-rw-r--r--  fs/openpromfs/inode.c | 2
-rw-r--r--  fs/orangefs/inode.c | 121
-rw-r--r--  fs/orangefs/super.c | 2
-rw-r--r--  fs/overlayfs/super.c | 2
-rw-r--r--  fs/pipe.c | 4
-rw-r--r--  fs/posix_acl.c | 10
-rw-r--r--  fs/proc/array.c | 1
-rw-r--r--  fs/proc/base.c | 9
-rw-r--r--  fs/proc/bootconfig.c | 2
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/page.c | 1
-rw-r--r--  fs/proc/vmcore.c | 43
-rw-r--r--  fs/pstore/platform.c | 38
-rw-r--r--  fs/pstore/ram_core.c | 4
-rw-r--r--  fs/qnx4/inode.c | 2
-rw-r--r--  fs/qnx6/inode.c | 2
-rw-r--r--  fs/read_write.c | 1
-rw-r--r--  fs/reiserfs/Kconfig | 10
-rw-r--r--  fs/reiserfs/inode.c | 56
-rw-r--r--  fs/reiserfs/journal.c | 4
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/remap_range.c | 16
-rw-r--r--  fs/romfs/super.c | 2
-rw-r--r--  fs/seq_file.c | 4
-rw-r--r--  fs/smbfs_common/smb2pdu.h | 639
-rw-r--r--  fs/splice.c | 24
-rw-r--r--  fs/squashfs/block.c | 11
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/stat.c | 68
-rw-r--r--  fs/sysfs/file.c | 13
-rw-r--r--  fs/sysfs/mount.c | 2
-rw-r--r--  fs/sysv/inode.c | 2
-rw-r--r--  fs/sysv/itree.c | 3
-rw-r--r--  fs/ubifs/dir.c | 238
-rw-r--r--  fs/ubifs/file.c | 46
-rw-r--r--  fs/ubifs/io.c | 34
-rw-r--r--  fs/ubifs/ioctl.c | 2
-rw-r--r--  fs/ubifs/journal.c | 52
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/ubifs/sysfs.c | 3
-rw-r--r--  fs/ubifs/ubifs.h | 2
-rw-r--r--  fs/udf/file.c | 3
-rw-r--r--  fs/udf/inode.c | 3
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/ufs/inode.c | 3
-rw-r--r--  fs/ufs/super.c | 2
-rw-r--r--  fs/unicode/Makefile | 2
-rw-r--r--  fs/userfaultfd.c | 5
-rw-r--r--  fs/vboxsf/file.c | 2
-rw-r--r--  fs/vboxsf/super.c | 2
-rw-r--r--  fs/vboxsf/utils.c | 1
-rw-r--r--  fs/verity/verify.c | 4
-rw-r--r--  fs/xattr.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 28
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 36
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h | 5
-rw-r--r--  fs/xfs/scrub/attr.h | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 7
-rw-r--r--  fs/xfs/xfs_bio_io.c | 45
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 2
-rw-r--r--  fs/xfs/xfs_buf.c | 58
-rw-r--r--  fs/xfs/xfs_buf.h | 42
-rw-r--r--  fs/xfs/xfs_buf_item.c | 5
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 2
-rw-r--r--  fs/xfs/xfs_fsmap.c | 4
-rw-r--r--  fs/xfs/xfs_fsops.c | 60
-rw-r--r--  fs/xfs/xfs_icache.c | 14
-rw-r--r--  fs/xfs/xfs_inode.c | 126
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 174
-rw-r--r--  fs/xfs/xfs_inode_item.h | 1
-rw-r--r--  fs/xfs/xfs_ioctl.c | 2
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iops.c | 118
-rw-r--r--  fs/xfs/xfs_linux.h | 2
-rw-r--r--  fs/xfs/xfs_log.c | 128
-rw-r--r--  fs/xfs/xfs_log_cil.c | 70
-rw-r--r--  fs/xfs/xfs_log_priv.h | 14
-rw-r--r--  fs/xfs/xfs_log_recover.c | 56
-rw-r--r--  fs/xfs/xfs_mount.c | 3
-rw-r--r--  fs/xfs/xfs_mount.h | 15
-rw-r--r--  fs/xfs/xfs_pnfs.c | 3
-rw-r--r--  fs/xfs/xfs_qm.c | 8
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 2
-rw-r--r--  fs/xfs/xfs_reflink.c | 5
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 2
-rw-r--r--  fs/xfs/xfs_super.c | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 8
-rw-r--r--  fs/xfs/xfs_trans.c | 138
-rw-r--r--  fs/xfs/xfs_trans.h | 8
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 51
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 3
-rw-r--r--  fs/zonefs/super.c | 62
445 files changed, 10366 insertions, 8369 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 55e108e5e133..1c8dc696d516 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -49,22 +49,20 @@ int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
void v9fs_cache_inode_get_cookie(struct inode *inode)
{
- struct v9fs_inode *v9inode;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
__le32 version;
__le64 path;
if (!S_ISREG(inode->i_mode))
return;
-
- v9inode = V9FS_I(inode);
- if (WARN_ON(v9inode->fscache))
+ if (WARN_ON(v9fs_inode_cookie(v9inode)))
return;
version = cpu_to_le32(v9inode->qid.version);
path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->fscache =
+ v9inode->netfs_ctx.cache =
fscache_acquire_cookie(v9fs_session_cache(v9ses),
0,
&path, sizeof(path),
@@ -72,5 +70,5 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
i_size_read(&v9inode->vfs_inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
- inode, v9inode->fscache);
+ inode, v9fs_inode_cookie(v9inode));
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 08f65c40af4f..e28ddf763b3b 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -623,9 +623,7 @@ static void v9fs_sysfs_cleanup(void)
static void v9fs_inode_init_once(void *foo)
{
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
-#endif
+
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
inode_init_once(&v9inode->vfs_inode);
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bc8b30205d36..ec0e8df3b2eb 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -9,6 +9,7 @@
#define FS_9P_V9FS_H
#include <linux/backing-dev.h>
+#include <linux/netfs.h>
/**
* enum p9_session_flags - option flags for each 9P session
@@ -108,14 +109,15 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
-#ifdef CONFIG_9P_FSCACHE
- struct fscache_cookie *fscache;
-#endif
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
struct mutex v_mutex;
- struct inode vfs_inode;
};
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
@@ -126,7 +128,7 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
{
#ifdef CONFIG_9P_FSCACHE
- return v9inode->fscache;
+ return netfs_i_cookie(&v9inode->vfs_inode);
#else
return NULL;
#endif
@@ -163,6 +165,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern const struct netfs_request_ops v9fs_req_ops;
extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
struct super_block *sb, int new);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9a10e68c5f30..501128188343 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -28,12 +28,12 @@
#include "fid.h"
/**
- * v9fs_req_issue_op - Issue a read from 9P
+ * v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
*/
-static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
+static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct netfs_read_request *rreq = subreq->rreq;
+ struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
struct iov_iter to;
loff_t pos = subreq->start + subreq->transferred;
@@ -52,20 +52,21 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
}
/**
- * v9fs_init_rreq - Initialise a read request
+ * v9fs_init_request - Initialise a read request
* @rreq: The read request
* @file: The file being read from
*/
-static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct p9_fid *fid = file->private_data;
refcount_inc(&fid->count);
rreq->netfs_priv = fid;
+ return 0;
}
/**
- * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq
+ * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request
* @mapping: unused mapping of request to cleanup
* @priv: private data to cleanup, a fid, guaranted non-null.
*/
@@ -77,21 +78,10 @@ static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
}
/**
- * v9fs_is_cache_enabled - Determine if caching is enabled for an inode
- * @inode: The inode to check
- */
-static bool v9fs_is_cache_enabled(struct inode *inode)
-{
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode));
-
- return fscache_cookie_enabled(cookie) && cookie->cache_priv;
-}
-
-/**
* v9fs_begin_cache_operation - Begin a cache operation for a read
* @rreq: The read request
*/
-static int v9fs_begin_cache_operation(struct netfs_read_request *rreq)
+static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
{
#ifdef CONFIG_9P_FSCACHE
struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
@@ -102,37 +92,14 @@ static int v9fs_begin_cache_operation(struct netfs_read_request *rreq)
#endif
}
-static const struct netfs_read_request_ops v9fs_req_ops = {
- .init_rreq = v9fs_init_rreq,
- .is_cache_enabled = v9fs_is_cache_enabled,
+const struct netfs_request_ops v9fs_req_ops = {
+ .init_request = v9fs_init_request,
.begin_cache_operation = v9fs_begin_cache_operation,
- .issue_op = v9fs_req_issue_op,
+ .issue_read = v9fs_issue_read,
.cleanup = v9fs_req_cleanup,
};
/**
- * v9fs_vfs_readpage - read an entire page in from 9P
- * @file: file being read
- * @page: structure to page
- *
- */
-static int v9fs_vfs_readpage(struct file *file, struct page *page)
-{
- struct folio *folio = page_folio(page);
-
- return netfs_readpage(file, folio, &v9fs_req_ops, NULL);
-}
-
-/**
- * v9fs_vfs_readahead - read a set of pages from 9P
- * @ractl: The readahead parameters
- */
-static void v9fs_vfs_readahead(struct readahead_control *ractl)
-{
- netfs_readahead(ractl, &v9fs_req_ops, NULL);
-}
-
-/**
* v9fs_release_page - release the private state associated with a page
* @page: The page to be released
* @gfp: The caller's allocation restrictions
@@ -158,18 +125,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
return 1;
}
-/**
- * v9fs_invalidate_page - Invalidate a page completely or partially
- * @page: The page to be invalidated
- * @offset: offset of the invalidated region
- * @length: length of the invalidated region
- */
-
-static void v9fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
folio_wait_fscache(folio);
}
@@ -249,16 +207,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
return retval;
}
-/**
- * v9fs_launder_page - Writeback a dirty page
- * @page: The page to be cleaned up
- *
- * Returns 0 on success.
- */
-
-static int v9fs_launder_page(struct page *page)
+static int v9fs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
int retval;
if (folio_clear_dirty_for_io(folio)) {
@@ -325,8 +275,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata,
- &v9fs_req_ops, NULL);
+ retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata);
if (retval < 0)
return retval;
@@ -376,25 +325,25 @@ out:
* Mark a page as having been made dirty and thus needing writeback. We also
* need to pin the cache object to write back to.
*/
-static int v9fs_set_page_dirty(struct page *page)
+static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct v9fs_inode *v9inode = V9FS_I(page->mapping->host);
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
- return fscache_set_page_dirty(page, v9fs_inode_cookie(v9inode));
+ return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
}
#else
-#define v9fs_set_page_dirty __set_page_dirty_nobuffers
+#define v9fs_dirty_folio filemap_dirty_folio
#endif
const struct address_space_operations v9fs_addr_operations = {
- .readpage = v9fs_vfs_readpage,
- .readahead = v9fs_vfs_readahead,
- .set_page_dirty = v9fs_set_page_dirty,
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
+ .dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
.releasepage = v9fs_release_page,
- .invalidatepage = v9fs_invalidate_page,
- .launder_page = v9fs_launder_page,
+ .invalidate_folio = v9fs_invalidate_folio,
+ .launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2a10242c79c7..55367ecb9442 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -228,12 +228,9 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
+ v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
-#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
@@ -250,6 +247,14 @@ void v9fs_free_inode(struct inode *inode)
kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
}
+/*
+ * Set parameters for the netfs library
+ */
+static void v9fs_set_netfs_context(struct inode *inode)
+{
+ netfs_i_context_init(inode, &v9fs_req_ops);
+}
+
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev)
{
@@ -338,6 +343,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
+
+ v9fs_set_netfs_context(inode);
error:
return err;
diff --git a/fs/Kconfig b/fs/Kconfig
index 6c7dc1387beb..30b751c7f11a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,7 +48,7 @@ config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
- select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
+ depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
help
@@ -344,7 +344,7 @@ config LOCKD
config LOCKD_V4
bool
- depends on NFSD_V3 || NFS_V3
+ depends on NFSD || NFS_V3
depends on FILE_LOCKING
default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..21c6332fa785 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -28,6 +28,16 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+config BINFMT_ELF_KUNIT_TEST
+ bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y && BINFMT_ELF=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ELF loader KUnit tests, which try to gather
+ prior bug fixes into a regression test collection. This is really
+ only needed for debugging. Note that with CONFIG_COMPAT=y, the
+ compat_binfmt_elf KUnit test is also created.
+
config COMPAT_BINFMT_ELF
def_bool y
depends on COMPAT && BINFMT_ELF
@@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+ bool
+
config ARCH_HAVE_ELF_PROT
bool
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 5156821bfe6a..561bc748c04a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,7 +73,8 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = adfs_readpage,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdbd26e571ed..e8bfc38239cd 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 75ebd2b576ca..b3f81d84ff4c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -453,7 +453,8 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
@@ -834,7 +835,8 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c609005a9eaa..4c5f30a83336 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
{
struct affs_inode_info *i;
- i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL);
if (!i)
return NULL;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index da9b4f8577a1..932e61e28e5d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -42,10 +42,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
-static int afs_dir_set_page_dirty(struct page *page)
+static bool afs_dir_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
BUG(); /* This should never happen. */
}
@@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = {
};
const struct address_space_operations afs_dir_aops = {
- .set_page_dirty = afs_dir_set_page_dirty,
+ .dirty_folio = afs_dir_dirty_folio,
.releasepage = afs_dir_releasepage,
- .invalidatepage = afs_dir_invalidatepage,
+ .invalidate_folio = afs_dir_invalidate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -2019,13 +2020,12 @@ static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
/*
* Invalidate part or all of a folio.
*/
-static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset,
- unsigned int length)
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
BUG_ON(!folio_test_locked(folio));
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index db832cc931c8..f120bcb8bf73 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
+ netfs_i_context_init(inode, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 720818a7c166..26292a110a8f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,13 +19,11 @@
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_readpage(struct file *file, struct page *page);
static int afs_symlink_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_readahead(struct readahead_control *ractl);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static void afs_vm_open(struct vm_area_struct *area);
static void afs_vm_close(struct vm_area_struct *area);
@@ -52,12 +50,12 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
- .readpage = afs_readpage,
- .readahead = afs_readahead,
- .set_page_dirty = afs_set_page_dirty,
- .launder_page = afs_launder_page,
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
+ .dirty_folio = afs_dirty_folio,
+ .launder_folio = afs_launder_folio,
.releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
.writepage = afs_writepage,
@@ -67,7 +65,7 @@ const struct address_space_operations afs_file_aops = {
const struct address_space_operations afs_symlink_aops = {
.readpage = afs_symlink_readpage,
.releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .invalidate_folio = afs_invalidate_folio,
};
static const struct vm_operations_struct afs_vm_ops = {
@@ -240,7 +238,7 @@ void afs_put_read(struct afs_read *req)
static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
- struct netfs_read_subrequest *subreq = req->subreq;
+ struct netfs_io_subrequest *subreq = req->subreq;
int error = op->error;
if (error == -ECONNABORTED)
@@ -310,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
return afs_do_sync_operation(op);
}
-static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
+static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
@@ -359,19 +357,13 @@ static int afs_symlink_readpage(struct file *file, struct page *page)
return ret;
}
-static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
rreq->netfs_priv = key_get(afs_file_key(file));
+ return 0;
}
-static bool afs_is_cache_enabled(struct inode *inode)
-{
- struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode));
-
- return fscache_cookie_enabled(cookie) && cookie->cache_priv;
-}
-
-static int afs_begin_cache_operation(struct netfs_read_request *rreq)
+static int afs_begin_cache_operation(struct netfs_io_request *rreq)
{
#ifdef CONFIG_AFS_FSCACHE
struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
@@ -396,27 +388,14 @@ static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
key_put(netfs_priv);
}
-const struct netfs_read_request_ops afs_req_ops = {
- .init_rreq = afs_init_rreq,
- .is_cache_enabled = afs_is_cache_enabled,
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
- .issue_op = afs_req_issue_op,
+ .issue_read = afs_issue_read,
.cleanup = afs_priv_cleanup,
};
-static int afs_readpage(struct file *file, struct page *page)
-{
- struct folio *folio = page_folio(page);
-
- return netfs_readpage(file, folio, &afs_req_ops, NULL);
-}
-
-static void afs_readahead(struct readahead_control *ractl)
-{
- netfs_readahead(ractl, &afs_req_ops, NULL);
-}
-
int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
@@ -427,8 +406,8 @@ int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
* Adjust the dirty region of the page on truncation or full invalidation,
* getting rid of the markers altogether if the region is entirely invalidated.
*/
-static void afs_invalidate_dirty(struct folio *folio, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_dirty(struct folio *folio, size_t offset,
+ size_t length)
{
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
unsigned long priv;
@@ -485,16 +464,14 @@ full_invalidate:
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (PagePrivate(page))
+ if (folio_get_private(folio))
afs_invalidate_dirty(folio, offset, length);
folio_wait_fscache(folio);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 5964f8aee090..2fe402483ad5 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
}
/*
+ * Set parameters for the netfs library
+ */
+static void afs_set_netfs_context(struct afs_vnode *vnode)
+{
+ netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops);
+}
+
+/*
* Initialise an inode from the vnode status.
*/
static int afs_inode_init_from_status(struct afs_operation *op,
@@ -128,6 +136,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
}
afs_set_i_size(vnode, status->size);
+ afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
@@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
+ vnode->netfs_ctx.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -420,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->cache = NULL;
+ vnode->netfs_ctx.cache = NULL;
return;
}
@@ -430,12 +440,14 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi);
afs_set_cache_aux(vnode, &aux);
- vnode->cache = fscache_acquire_cookie(
- vnode->volume->cache,
- vnode->status.type == AFS_FTYPE_FILE ? 0 : FSCACHE_ADV_SINGLE_CHUNK,
- &key, sizeof(key),
- &aux, sizeof(aux),
- vnode->status.size);
+ afs_vnode_set_cache(vnode,
+ fscache_acquire_cookie(
+ vnode->volume->cache,
+ vnode->status.type == AFS_FTYPE_FILE ?
+ 0 : FSCACHE_ADV_SINGLE_CHUNK,
+ &key, sizeof(key),
+ &aux, sizeof(aux),
+ vnode->status.size));
#endif
}
@@ -528,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
vnode = AFS_FS_I(inode);
vnode->cb_v_break = as->volume->cb_v_break,
+ afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
if (IS_ERR(op)) {
@@ -786,11 +799,8 @@ void afs_evict_inode(struct inode *inode)
afs_put_wb_key(wbk);
}
-#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(vnode->cache,
+ fscache_relinquish_cookie(afs_vnode_cache(vnode),
test_bit(AFS_VNODE_DELETED, &vnode->flags));
- vnode->cache = NULL;
-#endif
afs_prune_wb_keys(vnode);
afs_put_permits(rcu_access_pointer(vnode->permit_cache));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b6f02321fc09..7b7ef945dc78 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -207,7 +207,7 @@ struct afs_read {
loff_t file_size; /* File size returned by server */
struct key *key; /* The key to use to reissue the read */
struct afs_vnode *vnode; /* The file being read into. */
- struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */
+ struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
unsigned int call_debug_id;
@@ -619,15 +619,16 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct inode vfs_inode; /* the VFS's inode record */
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
-#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
-#endif
struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */
struct rw_semaphore validate_lock; /* lock for validating this vnode */
@@ -674,12 +675,20 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return vnode->cache;
+ return netfs_i_cookie(&vnode->vfs_inode);
#else
return NULL;
#endif
}
+static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
+ struct fscache_cookie *cookie)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ vnode->netfs_ctx.cache = cookie;
+#endif
+}
+
/*
* cached security record for one user's attempt to access a vnode
*/
@@ -1063,7 +1072,7 @@ extern const struct address_space_operations afs_file_aops;
extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
-extern const struct netfs_read_request_ops afs_req_ops;
+extern const struct netfs_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
@@ -1521,9 +1530,9 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
* write.c
*/
#ifdef CONFIG_AFS_FSCACHE
-extern int afs_set_page_dirty(struct page *);
+bool afs_dirty_folio(struct address_space *, struct folio *);
#else
-#define afs_set_page_dirty __set_page_dirty_nobuffers
+#define afs_dirty_folio filemap_dirty_folio
#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1537,7 +1546,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-extern int afs_launder_page(struct page *);
+int afs_launder_folio(struct folio *);
/*
* xattr.c
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 5ec9fd97eccc..1fea195b0b27 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -679,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;
- vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+ vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;
@@ -688,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
/* Reset anything that shouldn't leak from one inode to the next. */
memset(&vnode->fid, 0, sizeof(vnode->fid));
memset(&vnode->status, 0, sizeof(vnode->status));
+ afs_vnode_set_cache(vnode, NULL);
vnode->volume = NULL;
vnode->lock_key = NULL;
vnode->permit_cache = NULL;
-#ifdef CONFIG_AFS_FSCACHE
- vnode->cache = NULL;
-#endif
vnode->flags = 1 << AFS_VNODE_UNSET;
vnode->lock_state = AFS_VNODE_LOCK_NONE;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f447c902318d..4763132ca57e 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -22,9 +22,10 @@ static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len
* Mark a page as having been made dirty and thus needing writeback. We also
* need to pin the cache object to write back to.
*/
-int afs_set_page_dirty(struct page *page)
+bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- return fscache_set_page_dirty(page, afs_vnode_cache(AFS_FS_I(page->mapping->host)));
+ return fscache_dirty_folio(mapping, folio,
+ afs_vnode_cache(AFS_FS_I(mapping->host)));
}
static void afs_folio_start_fscache(bool caching, struct folio *folio)
{
@@ -59,8 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata,
- &afs_req_ops, NULL);
+ ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata);
if (ret < 0)
return ret;
@@ -354,9 +354,10 @@ static const struct afs_operation_ops afs_store_data_operation = {
static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
bool laundering)
{
+ struct netfs_i_context *ictx = &vnode->netfs_ctx;
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
- loff_t size = iov_iter_count(iter), i_size;
+ loff_t size = iov_iter_count(iter);
int ret = -ENOKEY;
_enter("%s{%llx:%llu.%u},%llx,%llx",
@@ -378,15 +379,13 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
return -ENOMEM;
}
- i_size = i_size_read(&vnode->vfs_inode);
-
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
op->file[0].modification = true;
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, i_size);
+ op->store.i_size = max(pos + size, ictx->remote_i_size);
op->store.laundering = laundering;
op->mtime = vnode->vfs_inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
@@ -617,8 +616,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
_debug("write discard %x @%llx [%llx]", len, start, i_size);
/* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(afs_vnode_cache(vnode),
- mapping, start, len, caching);
+ fscache_clear_page_bits(mapping, start, len, caching);
afs_pages_written_back(vnode, start, len);
ret = 0;
}
@@ -979,9 +977,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/*
* Clean up a page during invalidation.
*/
-int afs_launder_page(struct page *subpage)
+int afs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
struct iov_iter iter;
struct bio_vec bv[1];
@@ -989,7 +986,7 @@ int afs_launder_page(struct page *subpage)
unsigned int f, t;
int ret = 0;
- _enter("{%lx}", folio_index(folio));
+ _enter("{%lx}", folio->index);
priv = (unsigned long)folio_get_private(folio);
if (folio_clear_dirty_for_io(folio)) {
diff --git a/fs/aio.c b/fs/aio.c
index 4ceba13a7db0..3c249b938632 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -478,7 +478,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -1478,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
/*
* If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
@@ -1553,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
file = req->ki_filp;
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c1ba13d19024..b4b3567ac655 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
{
struct befs_inode_info *bi;
- bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 7f8544abf636..03139344568f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -188,7 +188,8 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = bfs_readpage,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fd691e4815c5..1926bec2c850 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep;
static struct inode *bfs_alloc_inode(struct super_block *sb)
{
struct bfs_inode_info *bi;
- bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d61543fbd652..63c7ebb0da89 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
+#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
@@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,12 +1075,12 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * The first time through the loop, load_addr_set is false:
+ * The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
@@ -1116,11 +1117,11 @@ out_free_interp:
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
- alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- if (interpreter || alignment > ELF_MIN_ALIGN) {
+ if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
@@ -1170,16 +1171,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1215,6 +1225,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1278,8 +1289,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1630,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1662,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1686,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note)
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1699,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note)
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1755,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task,
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
- long signr, size_t *total)
+ long signr, struct elf_note_info *info)
{
- unsigned int i;
+ unsigned int note_iter, view_iter;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1771,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
- *total += notesize(&t->notes[0]);
+ info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
/*
* Each other regset might generate a note too. For each regset
- * that has no core_note_type or is inactive, we leave t->notes[i]
- * all zero and we'll know to skip writing it later.
+ * that has no core_note_type or is inactive, skip it.
*/
- for (i = 1; i < view->n; ++i) {
- const struct user_regset *regset = &view->regsets[i];
+ note_iter = 1;
+ for (view_iter = 1; view_iter < view->n; ++view_iter) {
+ const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
@@ -1797,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (ret < 0)
continue;
+ if (WARN_ON_ONCE(note_iter >= info->thread_notes))
+ break;
+
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
- *total += notesize(&t->notes[i]);
+ info->size += notesize(&t->notes[note_iter]);
+ note_iter++;
}
return 1;
@@ -1811,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1883,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info))
return 0;
/*
@@ -1892,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2040,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2061,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2083,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2180,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2189,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
	 * The number of segs is recorded into the ELF header as a 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2212,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2237,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2257,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2295,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Align to page */
dump_skip_to(cprm, dataoff);
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2313,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
@@ -2335,3 +2344,7 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
+#include "binfmt_elf_test.c"
+#endif
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c6f588dc4a9d..08d0c8797828 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = {
.load_binary = load_elf_fdpic_binary,
#ifdef CONFIG_ELF_CORE
.core_dump = elf_fdpic_core_dump,
-#endif
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
static int __init init_elf_fdpic_binfmt(void)
@@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dump_skip_to(cprm, dataoff);
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1652,7 +1647,6 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c
new file mode 100644
index 000000000000..11d734fec366
--- /dev/null
+++ b/fs/binfmt_elf_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+static void total_mapping_size_test(struct kunit *test)
+{
+ struct elf_phdr empty[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, },
+ { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, },
+ };
+ /*
+ * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \
+ * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}'
+ */
+ struct elf_phdr mount[] = {
+ { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, },
+ { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, },
+ { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, },
+ { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, },
+ { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, },
+ };
+ size_t mount_size = 0xE070;
+ /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */
+ struct elf_phdr unordered[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ };
+
+ /* No headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0);
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0);
+ /* Empty headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0);
+ /* No PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0);
+ /* Empty PT_LOAD and non-PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0);
+
+ /* Normal set of PT_LOADS, and expected size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size);
+ /* Unordered PT_LOADs result in same size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size);
+}
+
+static struct kunit_case binfmt_elf_test_cases[] = {
+ KUNIT_CASE(total_mapping_size_test),
+ {},
+};
+
+static struct kunit_suite binfmt_elf_test_suite = {
+ .name = KBUILD_MODNAME,
+ .test_cases = binfmt_elf_test_cases,
+};
+
+kunit_test_suite(binfmt_elf_test_suite);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d776f80ee50..626898150011 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,6 +37,7 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -97,13 +98,17 @@ static int load_flat_shared_library(int id, struct lib_info *p);
#endif
static int load_flat_binary(struct linux_binprm *);
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm);
+#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
+#ifdef CONFIG_COREDUMP
.core_dump = flat_core_dump,
.min_coredump = PAGE_SIZE
+#endif
};
/****************************************************************************/
@@ -112,12 +117,14 @@ static struct linux_binfmt flat_format = {
* Currently only a stub-function.
*/
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm)
{
pr_warn("Process %s:%d received signr %d and should have core dumped\n",
current->comm, current->pid, cprm->siginfo->si_signo);
return 1;
}
+#endif
/****************************************************************************/
/*
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4188ba3fd8c3..99f9995670ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags)
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
obj-$(CONFIG_BTRFS_FS) := btrfs.o
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7e9f90fa0388..abac86a75840 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -78,7 +78,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 604a4d54cf0d..077c95e9baa5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3946,5 +3946,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
#define PageOrdered(page) PagePrivate2(page)
#define SetPageOrdered(page) SetPagePrivate2(page)
#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 20e70eb88465..ed8e288cc369 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,41 +1013,40 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- extent_invalidatepage(tree, page, offset);
- btree_releasepage(page, GFP_NOFS);
- if (PagePrivate(page)) {
- btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
- "page private not zero on page %llu",
- (unsigned long long)page_offset(page));
- detach_page_private(page);
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_releasepage(&folio->page, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
}
}
-static int btree_set_page_dirty(struct page *page)
-{
#ifdef DEBUG
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
- u64 page_start = page_offset(page);
+ u64 page_start = folio_pos(folio);
if (fs_info->sectorsize == PAGE_SIZE) {
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_write_locked(eb);
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
@@ -1073,18 +1072,20 @@ static int btree_set_page_dirty(struct page *page)
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
-#endif
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
- .invalidatepage = btree_invalidatepage,
+ .invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
- .set_page_dirty = btree_set_page_dirty,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -4128,8 +4129,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to do I/O, so we don't lose the ability to do integrity
* checking.
*/
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, device->bdev);
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
@@ -4141,7 +4143,6 @@ static int write_dev_supers(struct btrfs_device *device,
* go down lazy and there's a short window where the on-disk
* copies might still contain the older version.
*/
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
@@ -4253,10 +4254,8 @@ static void write_dev_flush(struct btrfs_device *device)
return;
#endif
- bio_reset(bio);
+ bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio_set_dev(bio, device->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 04083ee5ae6e..c3eb52dbe61c 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aa43f7811754..33c19f51d79b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1507,17 +1507,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
+ struct address_space *mapping = inode->i_mapping;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
- put_page(page);
- index++;
+ folio = filemap_get_folio(mapping, index);
+ filemap_dirty_folio(mapping, folio);
+ folio_account_redirty(folio);
+ index += folio_nr_pages(folio);
+ folio_put(folio);
}
}
@@ -3152,7 +3152,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
struct bio *bio;
ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
btrfs_bio_init(btrfs_bio(bio));
return bio;
}
@@ -3163,7 +3163,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
+ new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(new);
btrfs_bio_init(bbio);
bbio->iter = bio->bi_iter;
@@ -3178,7 +3178,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
bbio = btrfs_bio(bio);
@@ -3330,7 +3330,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
- bio->bi_write_hint = inode->vfs_inode.i_write_hint;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
@@ -4068,6 +4067,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u64 page_start = page_offset(page);
@@ -4088,8 +4088,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return 0;
}
@@ -5239,17 +5239,17 @@ void extent_readahead(struct readahead_control *rac)
}
/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
+ * basic invalidate_folio code, this waits on any locked or writeback
+ * ranges corresponding to the folio, and then deletes any extent state
* records from the tree
*/
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset)
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset)
{
struct extent_state *cached_state = NULL;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ u64 start = folio_pos(folio);
+ u64 end = start + folio_size(folio) - 1;
+ size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
@@ -5259,7 +5259,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent_bits(tree, start, end, &cached_state);
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/*
* Currently for btree io tree, only EXTENT_LOCKED is utilized,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 673b9259f6c0..95c499b8424e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5074,16 +5074,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
/*
- * While truncating the inode pages during eviction, we get the VFS calling
- * btrfs_invalidatepage() against each page of the inode. This is slow because
- * the calls to btrfs_invalidatepage() result in a huge amount of calls to
- * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
- * extent_state structures over and over, wasting lots of time.
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
*
- * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
- * those expensive operations on a per page basis and do only the ordered io
- * finishing, while we release here the extent_map and extent_state structures,
- * without the excessive merging and splitting.
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
@@ -5149,7 +5150,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
- * (Refer to comment in btrfs_invalidatepage, case 2)
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
@@ -8170,8 +8171,8 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidatepage() we have a race window where
- * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * For releasepage() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
@@ -8247,48 +8248,48 @@ static int btrfs_migratepage(struct address_space *mapping,
}
#endif
-static void btrfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
- * We have page locked so no new ordered extent can be created on this
- * page, nor bio can be submitted for this page.
+ * We have folio locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this folio.
*
- * But already submitted bio can still be finished on this page.
- * Furthermore, endio function won't skip page which has Ordered
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
* (Private2) already cleared, so it's possible for endio and
- * invalidatepage to do the same ordered extent accounting twice
- * on one page.
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
*
* So here we wait for any submitted bios to finish, so that we won't
- * do double ordered extent accounting on the same page.
+ * do double ordered extent accounting on the same folio.
*/
- wait_on_page_writeback(page);
- wait_subpage_spinlock(page);
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
/*
* For subpage case, we have call sites like
* btrfs_punch_hole_lock_range() which passes range not aligned to
* sectorsize.
- * If the range doesn't cover the full page, we don't need to and
- * shouldn't clear page extent mapped, as page->private can still
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
* record subpage dirty bits for other part of the range.
*
-	 * For cases that invalidate the full page even the range doesn't
- * cover the full page, like invalidating the last page, we're
+ * For cases that invalidate the full folio even the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
* still safe to wait for ordered extent to finish.
*/
- if (!(offset == 0 && length == PAGE_SIZE)) {
- btrfs_releasepage(page, GFP_NOFS);
+ if (!(offset == 0 && length == folio_size(folio))) {
+ btrfs_releasepage(&folio->page, GFP_NOFS);
return;
}
@@ -8329,7 +8330,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8339,7 +8340,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
delete_states = false;
goto next;
}
- btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8409,11 +8410,11 @@ next:
* should not have Ordered (Private2) anymore, or the above iteration
* did something wrong.
*/
- ASSERT(!PageOrdered(page));
- btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(page, GFP_NOFS);
- clear_page_extent_mapped(page);
+ __btrfs_releasepage(&folio->page, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
}
/*
@@ -8811,7 +8812,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -10048,11 +10049,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static int btrfs_set_page_dirty(struct page *page)
-{
- return __set_page_dirty_nobuffers(page);
-}
-
static int btrfs_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
@@ -11366,12 +11362,12 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
- .invalidatepage = btrfs_invalidatepage,
+ .invalidate_folio = btrfs_invalidate_folio,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
- .set_page_dirty = btrfs_set_page_dirty,
+ .dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 04a88bfe4fcf..998e3f180d90 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -645,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
int ret;
/*
- * Lock destination range to serialize with concurrent readpages() and
+ * Lock destination range to serialize with concurrent readahead() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
@@ -739,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
}
/*
- * Lock destination range to serialize with concurrent readpages() and
+ * Lock destination range to serialize with concurrent readahead() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, off, inode, destoff, len);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e112b6bd371..2b5561ae5d0b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -53,7 +53,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint hint, struct writeback_control *wbc);
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -613,17 +613,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*/
-int __set_page_dirty_buffers(struct page *page)
+bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- int newly_dirty;
- struct address_space *mapping = page_mapping(page);
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
+ struct buffer_head *head;
+ bool newly_dirty;
spin_lock(&mapping->private_lock);
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (head) {
struct buffer_head *bh = head;
do {
@@ -635,21 +632,21 @@ int __set_page_dirty_buffers(struct page *page)
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
- lock_page_memcg(page);
- newly_dirty = !TestSetPageDirty(page);
+ folio_memcg_lock(folio);
+ newly_dirty = !folio_test_set_dirty(folio);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __folio_mark_dirty(folio, mapping, 1);
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
-EXPORT_SYMBOL(__set_page_dirty_buffers);
+EXPORT_SYMBOL(block_dirty_folio);
/*
* Write out and wait upon a list of buffers.
@@ -1235,16 +1232,18 @@ static void bh_lru_install(struct buffer_head *bh)
int i;
check_irqs_on();
+ bh_lru_lock();
+
/*
* the refcount of buffer_head in bh_lru prevents dropping the
* attached page(i.e., try_to_free_buffers) so it could cause
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
- if (lru_cache_disabled())
+ if (lru_cache_disabled()) {
+ bh_lru_unlock();
return;
-
- bh_lru_lock();
+ }
b = this_cpu_ptr(&bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
@@ -1482,41 +1481,40 @@ static void discard_buffer(struct buffer_head * bh)
}
/**
- * block_invalidatepage - invalidate part or all of a buffer-backed page
- *
- * @page: the page which is affected
+ * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
+ * @folio: The folio which is affected.
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
- * block_invalidatepage() is called when all or part of the page has become
+ * block_invalidate_folio() is called when all or part of the folio has been
* invalidated by a truncate operation.
*
- * block_invalidatepage() does not have to release all buffers, but it must
+ * block_invalidate_folio() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
- unsigned int curr_off = 0;
- unsigned int stop = length + offset;
+ size_t curr_off = 0;
+ size_t stop = length + offset;
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
+ BUG_ON(!folio_test_locked(folio));
/*
* Check for overflow
*/
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
+
+ head = folio_buffers(folio);
+ if (!head)
+ return;
- head = page_buffers(page);
bh = head;
do {
- unsigned int next_off = curr_off + bh->b_size;
+ size_t next_off = curr_off + bh->b_size;
next = bh->b_this_page;
/*
@@ -1535,21 +1533,21 @@ void block_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
/*
- * We release buffers only if the entire page is being invalidated.
+ * We release buffers only if the entire folio is being invalidated.
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (length == PAGE_SIZE)
- try_to_release_page(page, 0);
+ if (length == folio_size(folio))
+ filemap_release_folio(folio, 0);
out:
return;
}
-EXPORT_SYMBOL(block_invalidatepage);
+EXPORT_SYMBOL(block_invalidate_folio);
/*
* We attach and possibly dirty the buffers atomically wrt
- * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
+ * block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the page lock.
*/
void create_empty_buffers(struct page *page,
@@ -1724,12 +1722,12 @@ int __block_write_full_page(struct inode *inode, struct page *page,
(1 << BH_Dirty)|(1 << BH_Uptodate));
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -1806,8 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1861,8 +1858,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -2206,29 +2202,27 @@ int generic_write_end(struct file *file, struct address_space *mapping,
EXPORT_SYMBOL(generic_write_end);
/*
- * block_is_partially_uptodate checks whether buffers within a page are
+ * block_is_partially_uptodate checks whether buffers within a folio are
* uptodate or not.
*
- * Returns true if all buffers which correspond to a file portion
- * we want to read are uptodate.
+ * Returns true if all buffers which correspond to the specified part
+ * of the folio are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
- int ret = 1;
+ bool ret = true;
- if (!page_has_buffers(page))
- return 0;
-
- head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (!head)
+ return false;
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_SIZE - from, count);
+ to = min_t(unsigned, folio_size(folio) - from, count);
to = from + to;
- if (from < blocksize && to > PAGE_SIZE - blocksize)
- return 0;
+ if (from < blocksize && to > folio_size(folio) - blocksize)
+ return false;
bh = head;
block_start = 0;
@@ -2236,7 +2230,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
block_end = block_start + blocksize;
if (block_end > from && block_start < to) {
if (!buffer_uptodate(bh)) {
- ret = 0;
+ ret = false;
break;
}
if (block_end >= to)
@@ -2358,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
if (err)
goto out;
- err = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_CONT_EXPAND, &page, &fsdata);
+ err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
if (err)
goto out;
@@ -3008,7 +3001,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint write_hint, struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct bio *bio;
@@ -3024,13 +3017,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- bio = bio_alloc(GFP_NOIO, 1);
+ if (buffer_meta(bh))
+ op_flags |= REQ_META;
+ if (buffer_prio(bh))
+ op_flags |= REQ_PRIO;
+
+ bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
- bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
@@ -3038,12 +3034,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- if (buffer_meta(bh))
- op_flags |= REQ_META;
- if (buffer_prio(bh))
- op_flags |= REQ_PRIO;
- bio_set_op_attrs(bio, op, op_flags);
-
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(bio);
@@ -3058,7 +3048,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3185,7 +3175,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*
* The same applies to regular filesystem pages: if all the buffers are
* clean then we set the page clean and proceed. To do that, we require
- * total exclusion from __set_page_dirty_buffers(). That is obtained with
+ * total exclusion from block_dirty_folio(). That is obtained with
* private_lock.
*
* try_to_free_buffers() is non-blocking.
@@ -3252,7 +3242,7 @@ int try_to_free_buffers(struct page *page)
* the page also.
*
* private_lock must be held over this entire operation in order
- * to synchronise against __set_page_dirty_buffers and prevent the
+ * to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 753986ea1583..9dc81e781f2b 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -138,7 +138,6 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos + skipped;
ki->iocb.ki_flags = IOCB_DIRECT;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
ki->skipped = skipped;
ki->object = object;
@@ -313,7 +312,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos;
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
ki->object = object;
ki->inval_counter = cres->inval_counter;
@@ -382,18 +380,18 @@ presubmission_error:
* Prepare a read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
-static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq,
+static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
loff_t i_size)
{
enum cachefiles_prepare_read_trace why;
- struct netfs_read_request *rreq = subreq->rreq;
+ struct netfs_io_request *rreq = subreq->rreq;
struct netfs_cache_resources *cres = &rreq->cache_resources;
struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
struct file *file = cachefiles_cres_file(cres);
- enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER;
+ enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
@@ -406,7 +404,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque
}
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
- __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
why = cachefiles_trace_read_no_data;
goto out_no_object;
}
@@ -475,7 +473,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque
goto out;
download_and_store:
- __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f256c8aff7bb..ca9f3e4ec4b3 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -57,6 +57,16 @@ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
trace_cachefiles_mark_inactive(object, inode);
}
+static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
+{
+ struct inode *inode = d_backing_inode(dentry);
+
+ inode_lock(inode);
+ __cachefiles_unmark_inode_in_use(object, dentry);
+ inode_unlock(inode);
+}
+
/*
* Unmark a backing inode and tell cachefilesd that there's something that can
* be culled.
@@ -68,9 +78,7 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
struct inode *inode = file_inode(file);
if (inode) {
- inode_lock(inode);
- __cachefiles_unmark_inode_in_use(object, file->f_path.dentry);
- inode_unlock(inode);
+ cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry);
if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
atomic_long_add(inode->i_blocks, &cache->b_released);
@@ -484,7 +492,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
object, d_backing_inode(path.dentry), ret,
cachefiles_trace_trunc_error);
file = ERR_PTR(ret);
- goto out_dput;
+ goto out_unuse;
}
}
@@ -494,15 +502,20 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry),
PTR_ERR(file),
cachefiles_trace_open_error);
- goto out_dput;
+ goto out_unuse;
}
if (unlikely(!file->f_op->read_iter) ||
unlikely(!file->f_op->write_iter)) {
fput(file);
pr_notice("Cache does not support read_iter and write_iter\n");
file = ERR_PTR(-EINVAL);
+ goto out_unuse;
}
+ goto out_dput;
+
+out_unuse:
+ cachefiles_do_unmark_inode_in_use(object, path.dentry);
out_dput:
dput(path.dentry);
out:
@@ -590,14 +603,16 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
check_failed:
fscache_cookie_lookup_negative(object->cookie);
cachefiles_unmark_inode_in_use(object, file);
- if (ret == -ESTALE) {
- fput(file);
- dput(dentry);
+ fput(file);
+ dput(dentry);
+ if (ret == -ESTALE)
return cachefiles_create_file(object);
- }
+ return false;
+
error_fput:
fput(file);
error:
+ cachefiles_do_unmark_inode_in_use(object, dentry);
dput(dentry);
return false;
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 35465109d9c4..00b087c14995 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -203,7 +203,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
if (!buf)
return false;
buf->reserved = cpu_to_be32(0);
- memcpy(buf->data, p, len);
+ memcpy(buf->data, p, volume->vcookie->coherency_len);
ret = cachefiles_inject_write_error();
if (ret == 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c98e5238a1b6..aa25bffd4823 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- if (PageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- BUG_ON(!PagePrivate(page));
- return 0;
+ if (folio_test_dirty(folio)) {
+ dout("%p dirty_folio %p idx %lu -- already dirty\n",
+ mapping->host, folio, folio->index);
+ BUG_ON(!folio_get_private(folio));
+ return false;
}
inode = mapping->host;
@@ -111,56 +110,56 @@ static int ceph_set_page_dirty(struct page *page)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
+ mapping->host, folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(PagePrivate(page));
- attach_page_private(page, snapc);
+ BUG_ON(folio_get_private(folio));
+ folio_attach_private(folio, snapc);
- return ceph_fscache_set_page_dirty(page);
+ return ceph_fscache_dirty_folio(mapping, folio);
}
/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
*/
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != thp_size(page)) {
- dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
- inode, page, page->index, offset, length);
+ if (offset != 0 || length != folio_size(folio)) {
+ dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
+ inode, folio->index, offset, length);
return;
}
- WARN_ON(!PageLocked(page));
- if (PagePrivate(page)) {
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_get_private(folio)) {
+ dout("%p invalidate_folio idx %lu full dirty page\n",
+ inode, folio->index);
- snapc = detach_page_private(page);
+ snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
}
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
static int ceph_releasepage(struct page *page, gfp_t gfp)
@@ -183,9 +182,9 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)
return 1;
}
-static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
@@ -200,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
rreq->len = roundup(rreq->len, lo->stripe_unit);
}
-static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = subreq->rreq->mapping->host;
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -219,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- struct netfs_read_subrequest *subreq = req->r_priv;
+ struct netfs_io_subrequest *subreq = req->r_priv;
int num_pages;
int err = req->r_result;
@@ -245,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
iput(req->r_inode);
}
-static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
+}
+
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct netfs_read_request *rreq = subreq->rreq;
- struct inode *inode = rreq->mapping->host;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -259,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ ceph_netfs_issue_op_inline(subreq))
+ return;
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -297,6 +353,45 @@ out:
dout("%s: result %d\n", __func__, err);
}
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+ struct inode *inode = rreq->inode;
+ int got = 0, want = CEPH_CAP_FILE_CACHE;
+ int ret = 0;
+
+ if (rreq->origin != NETFS_READAHEAD)
+ return 0;
+
+ if (file) {
+ struct ceph_rw_context *rw_ctx;
+ struct ceph_file_info *fi = file->private_data;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ if (rw_ctx)
+ return 0;
+ }
+
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ return ret;
+ }
+
+ if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ return -EACCES;
+ }
+ if (ret == 0)
+ return -EACCES;
+
+ rreq->netfs_priv = (void *)(uintptr_t)got;
+ return 0;
+}
+
static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
{
struct inode *inode = mapping->host;
@@ -307,78 +402,16 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
ceph_put_cap_refs(ci, got);
}
-static const struct netfs_read_request_ops ceph_netfs_read_ops = {
- .is_cache_enabled = ceph_is_cache_enabled,
+const struct netfs_request_ops ceph_netfs_ops = {
+ .init_request = ceph_init_request,
.begin_cache_operation = ceph_begin_cache_operation,
- .issue_op = ceph_netfs_issue_op,
+ .issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
.cleanup = ceph_readahead_cleanup,
};
-/* read a single page, without unlocking it. */
-static int ceph_readpage(struct file *file, struct page *subpage)
-{
- struct folio *folio = page_folio(subpage);
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vino vino = ceph_vino(inode);
- size_t len = folio_size(folio);
- u64 off = folio_file_pos(folio);
-
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0) {
- folio_unlock(folio);
- return -EINVAL;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
- }
-
- dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
- vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
-
- return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
-}
-
-static void ceph_readahead(struct readahead_control *ractl)
-{
- struct inode *inode = file_inode(ractl->file);
- struct ceph_file_info *fi = ractl->file->private_data;
- struct ceph_rw_context *rw_ctx;
- int got = 0;
- int ret = 0;
-
- if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
- return;
-
- rw_ctx = ceph_find_rw_context(fi);
- if (!rw_ctx) {
- /*
- * readahead callers do not necessarily hold Fcb caps
- * (e.g. fadvise, madvise).
- */
- int want = CEPH_CAP_FILE_CACHE;
-
- ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
- if (ret < 0)
- dout("start_read %p, error getting cap\n", inode);
- else if (!(got & want))
- dout("start_read %p, no cache cap\n", inode);
-
- if (ret <= 0)
- return;
- }
- netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
-}
-
#ifdef CONFIG_CEPH_FSCACHE
static void ceph_set_page_fscache(struct page *page)
{
@@ -516,6 +549,7 @@ static u64 get_writepages_data_length(struct inode *inode,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -550,8 +584,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
+ dout("folio at %lu beyond eof %llu\n", folio->index,
+ ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -563,7 +598,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = true;
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
@@ -623,7 +658,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = false;
return err;
}
@@ -635,6 +670,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode);
ihold(inode);
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ ceph_inode_to_client(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
wait_on_page_fscache(page);
err = writepage_nounlock(page, wbc);
@@ -707,8 +746,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
+ fsc->write_congested = false;
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
@@ -760,6 +798,10 @@ static int ceph_writepages_start(struct address_space *mapping,
bool done = false;
bool caching = ceph_is_cache_enabled(inode);
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fsc->write_congested)
+ return 0;
+
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
@@ -867,14 +909,16 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n",
- page, ceph_wbc.i_size);
+ struct folio *folio = page_folio(page);
+
+ dout("folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
- page_offset(page) >= i_size_read(inode)) &&
- clear_page_dirty_for_io(page))
- mapping->a_ops->invalidatepage(page,
- 0, thp_size(page));
- unlock_page(page);
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
@@ -954,11 +998,8 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = true;
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
@@ -1274,45 +1315,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
- pgoff_t index = pos >> PAGE_SHIFT;
int r;
- /*
- * Uninlining should have already been done and everything updated, EXCEPT
- * for inline_version sent to the MDS.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
- folio = __filemap_get_folio(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
-
- /*
- * The inline_version on a new inode is set to 1. If that's the
- * case, then the folio is brand new and isn't yet Uptodate.
- */
- r = 0;
- if (index == 0 && ci->i_inline_version != 1) {
- if (!folio_test_uptodate(folio)) {
- WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
- ci->i_inline_version);
- r = -EINVAL;
- }
- goto out;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- goto out;
- }
-
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
- &ceph_netfs_read_ops, NULL);
-out:
+ r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL);
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1366,14 +1372,14 @@ out:
}
const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readahead = ceph_readahead,
+ .readpage = netfs_readpage,
+ .readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
.releasepage = ceph_releasepage,
.direct_IO = noop_direct_IO,
};
@@ -1508,19 +1514,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
if (off + thp_size(page) <= size)
len = thp_size(page);
else
@@ -1577,11 +1570,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1645,16 +1636,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
+ }
+
+ folio_lock(folio);
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1665,45 +1670,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
- goto out;
+ goto out_unlock;
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
-
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
- }
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
@@ -1711,7 +1682,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
@@ -1720,7 +1691,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1729,10 +1700,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1742,7 +1714,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1753,7 +1725,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
@@ -1764,19 +1736,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-out_put:
+ if (!err) {
+ int dirty;
+
+ /* Set i_inline_version to CEPH_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
- }
-
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
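
The set_bdi_congested()/clear_bdi_congested() calls dropped throughout this file are replaced by a per-client write_congested flag with on/off hysteresis around thresholds derived from the congestion_kb mount option. Below is a minimal userspace C sketch of that pattern; the struct, field names and fixed thresholds are illustrative assumptions, not the kernel's CONGESTION_ON_THRESH()/CONGESTION_OFF_THRESH() macros.

/* Userspace sketch of write-congestion hysteresis: set the flag when the
 * number of pages under writeback crosses an "on" threshold, clear it only
 * after falling below a lower "off" threshold. */
#include <stdbool.h>
#include <stdio.h>

struct client {
	long writeback_pages;	/* models fsc->writeback_count */
	bool write_congested;	/* models fsc->write_congested */
	long congestion_on;	/* pages: start skipping WB_SYNC_NONE writeback */
	long congestion_off;	/* pages: resume normal writeback (must be lower) */
};

static void page_starts_writeback(struct client *c)
{
	if (++c->writeback_pages > c->congestion_on)
		c->write_congested = true;
}

static void page_ends_writeback(struct client *c)
{
	if (--c->writeback_pages < c->congestion_off)
		c->write_congested = false;
}

int main(void)
{
	struct client c = { 0, false, 100, 75 };
	int i;

	for (i = 0; i < 120; i++)
		page_starts_writeback(&c);
	printf("congested after 120 pages: %d\n", c.write_congested);

	for (i = 0; i < 50; i++)
		page_ends_writeback(&c);
	printf("congested after completing 50: %d\n", c.write_congested);
	return 0;
}
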
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 7d22850623ef..ddea99922073 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -29,26 +29,25 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
if (!(inode->i_state & I_NEW))
return;
- WARN_ON_ONCE(ci->fscache);
+ WARN_ON_ONCE(ci->netfs_ctx.cache);
- ci->fscache = fscache_acquire_cookie(fsc->fscache, 0,
- &ci->i_vino, sizeof(ci->i_vino),
- &ci->i_version, sizeof(ci->i_version),
- i_size_read(inode));
+ ci->netfs_ctx.cache =
+ fscache_acquire_cookie(fsc->fscache, 0,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
}
-void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
{
- struct fscache_cookie *cookie = ci->fscache;
-
- fscache_relinquish_cookie(cookie, false);
+ fscache_relinquish_cookie(ceph_fscache_cookie(ci), false);
}
void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- fscache_use_cookie(ci->fscache, will_modify);
+ fscache_use_cookie(ceph_fscache_cookie(ci), will_modify);
}
void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
@@ -58,9 +57,10 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
if (update) {
loff_t i_size = i_size_read(inode);
- fscache_unuse_cookie(ci->fscache, &ci->i_version, &i_size);
+ fscache_unuse_cookie(ceph_fscache_cookie(ci),
+ &ci->i_version, &i_size);
} else {
- fscache_unuse_cookie(ci->fscache, NULL, NULL);
+ fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL);
}
}
@@ -69,14 +69,14 @@ void ceph_fscache_update(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
loff_t i_size = i_size_read(inode);
- fscache_update_cookie(ci->fscache, &ci->i_version, &i_size);
+ fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size);
}
void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- fscache_invalidate(ceph_inode(inode)->fscache,
+ fscache_invalidate(ceph_fscache_cookie(ci),
&ci->i_version, i_size_read(inode),
dio_write ? FSCACHE_INVAL_DIO_WRITE : 0);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 09164389fa66..7255b790a4c1 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -26,14 +26,9 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update);
void ceph_fscache_update(struct inode *inode);
void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
- ci->fscache = NULL;
-}
-
static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- return ci->fscache;
+ return netfs_i_cookie(&ci->vfs_inode);
}
static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
@@ -54,15 +49,15 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode,
fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline int ceph_fscache_set_page_dirty(struct page *page)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_info *ci = ceph_inode(mapping->host);
- return fscache_set_page_dirty(page, ceph_fscache_cookie(ci));
+ return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
@@ -91,10 +86,6 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
-}
-
static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
{
}
@@ -133,9 +124,10 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode,
{
}
-static inline int ceph_fscache_set_page_dirty(struct page *page)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
static inline bool ceph_is_cache_enabled(struct inode *inode)
@@ -143,7 +135,7 @@ static inline bool ceph_is_cache_enabled(struct inode *inode)
return false;
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
return -ENOBUFS;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b472cd066d1c..5c14ef04e474 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ return;
+ }
+
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
retry:
@@ -2267,6 +2274,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2287,6 +2296,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2409,6 +2420,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
@@ -2439,6 +2453,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
u64 first_tid = 0;
u64 last_snap_flush = 0;
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
@@ -3856,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3919,6 +3938,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3944,6 +3964,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
@@ -4152,7 +4173,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
@@ -4178,6 +4198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3cf7c9c1085b..bec3c4549c07 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_client_metric *cm = &fsc->mdsc->metric;
struct ceph_metric *m;
- s64 total, sum, avg, min, max, sq;
+ s64 total, avg, min, max, sq;
int i;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
@@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
m = &cm->metric[i];
spin_lock(&m->lock);
total = m->total;
- sum = m->latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ avg = m->latency_avg;
min = m->latency_min;
max = m->latency_max;
sq = m->latency_sq_sum;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 133dbd9338e7..eae417d71136 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_mutex, no need to use page lock */
+ i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
+ * marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
@@ -478,8 +478,11 @@ more:
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -520,6 +523,12 @@ more:
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: There is no need to put 'dfi->last_readdir' here,
+ * because when dir_emit() stops us it most likely ran out
+ * of buffer space; the next readdir call will simply
+ * continue from where we left off.
+ */
dout("filldir stopping us...\n");
return 0;
}
@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bbed3224ad68..6c9e837aa1d3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
struct ceph_mount_options *opt =
ceph_inode_to_client(&ci->vfs_inode)->mount_options;
struct ceph_file_info *fi;
+ int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
@@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ if ((file->f_mode & FMODE_WRITE) &&
+ ci->i_inline_version != CEPH_INLINE_NONE) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
}
/*
@@ -516,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
}
}
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- struct dentry *dentry = req->r_dentry;
- struct inode *inode = d_inode(dentry);
int pathlen = 0;
u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
- ceph_inode_shutdown(inode);
-
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
}
- if (req->r_target_inode) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- u64 ino = ceph_vino(req->r_target_inode).ino;
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
- mapping_set_error(req->r_target_inode->i_mapping, result);
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
- ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
- wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
- }
- ceph_kick_flushing_inode_caps(req->r_session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
@@ -1041,7 +1072,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1778,12 +1808,6 @@ retry_snap:
if (err)
goto out;
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- err = ceph_uninline_data(file, NULL);
- if (err < 0)
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
@@ -1845,7 +1869,7 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_perform_write(file, from, pos);
+ written = generic_perform_write(iocb, from);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
@@ -1855,7 +1879,6 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2109,12 +2132,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- ret = ceph_uninline_data(file, NULL);
- if (ret < 0)
- goto unlock;
- }
-
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
@@ -2139,7 +2156,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2532,7 +2548,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
- dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ef4a980a7bf3..63113e2a4890 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent)
if (!S_ISDIR(parent->i_mode)) {
pr_warn_once("bad snapdir parent type (mode=0%o)\n",
parent->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once("bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
inode->i_mode = parent->i_mode;
@@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
@@ -447,12 +453,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
dout("alloc_inode %p\n", &ci->vfs_inode);
+ /* Set parameters for the netfs library */
+ netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops);
+
spin_lock_init(&ci->i_ceph_lock);
ci->i_version = 0;
@@ -538,9 +547,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
-
- ceph_fscache_inode_init(ci);
-
return &ci->vfs_inode;
}
@@ -1201,7 +1207,7 @@ out_unlock:
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1598,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -2301,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
+
/*
* Check inode permissions. We verify we have a valid value for
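
The new ceph_do_getvxattr() follows the usual getxattr convention: a zero-size call returns only the value length, and a buffer that is too small fails with -ERANGE. A hedged userspace sketch of the standard two-call probe pattern against the getxattr(2) syscall; the path and attribute name below are placeholders.

/* Userspace sketch of the size==0 probe convention used by getxattr-style
 * interfaces, including the vxattr path added above. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/cephfs/file";	/* placeholder path */
	const char *name = "ceph.file.layout";	/* placeholder vxattr name */
	ssize_t len;
	char *buf;

	/* First call with size 0 asks only for the value length. */
	len = getxattr(path, name, NULL, 0);
	if (len < 0) {
		perror("getxattr(probe)");
		return 1;
	}

	buf = malloc(len + 1);
	if (!buf)
		return 1;

	/* Second call with an adequately sized buffer copies the value;
	 * a too-small buffer would fail with ERANGE instead. */
	len = getxattr(path, name, buf, len);
	if (len < 0) {
		perror("getxattr");
		free(buf);
		return 1;
	}
	buf[len] = '\0';
	printf("%s = %s\n", name, buf);
	free(buf);
	return 0;
}
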
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d1f154aec249..3e2843e86e27 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- if (wait)
- req->r_wait_for_completion = ceph_lock_wait_for_completion;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
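
The lock path above is the reason ceph_mdsc_do_request() is split into submit plus wait: the wait function is now passed in per call, so only lock requests use the interruptible ceph_lock_wait_for_completion(). A small userspace sketch of that dependency-injected wait; the names are local stand-ins, not the ceph_mdsc_* API, and the "completion" is faked.

/* Userspace sketch: submit a request, then wait with either a default or a
 * caller-supplied wait function. */
#include <stdio.h>

struct request {
	int id;
	int done;
	int result;
};

typedef int (*wait_fn_t)(struct request *req);

static int submit_request(struct request *req)
{
	/* The real code queues the request to an MDS session; here we just
	 * pretend it completed immediately. */
	req->done = 1;
	req->result = 0;
	return 0;
}

static int default_wait(struct request *req)
{
	/* Stand-in for the killable, timeout-bounded completion wait. */
	return req->done ? req->result : -1;
}

static int lock_wait(struct request *req)
{
	/* Stand-in for a lock-specific wait that can be interrupted and
	 * cancel the pending lock request. */
	printf("waiting interruptibly for lock request %d\n", req->id);
	return req->done ? req->result : -1;
}

static int wait_request(struct request *req, wait_fn_t wait_fn)
{
	return wait_fn ? wait_fn(req) : default_wait(req);
}

int main(void)
{
	struct request req = { .id = 1 };
	int err = submit_request(&req);

	if (!err)
		err = wait_request(&req, lock_wait);
	printf("lock request result: %d\n", err);
	return 0;
}
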
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c30eefc0ac19..00c3de177dd6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -555,6 +555,28 @@ bad:
return -EIO;
}
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
+bad:
+ return -EIO;
+}
+
/*
* parse extra results
*/
@@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end,
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -2178,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
+ __GFP_NOWARN |
+ __GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
@@ -2946,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
-static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
- if (!req->r_timeout && req->r_wait_for_completion) {
- err = req->r_wait_for_completion(mdsc, req);
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
@@ -3011,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
- err = ceph_mdsc_wait_request(mdsc, req);
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
@@ -3097,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu\n", req->r_tid);
- req->r_resend_mds = -1;
- if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now\n");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- int mds = __choose_mds(mdsc, req, NULL);
- if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending\n");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu\n", req->r_tid);
- }
-
-
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
@@ -4437,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4447,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
@@ -4841,7 +4832,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
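
parse_reply_info_getvxattr() above skips the version bytes and payload length, then accepts the xattr value only when its encoded length exactly matches the remaining reply bytes. Below is a small userspace sketch of that bounds-checked decode; the helpers are local stand-ins for the ceph_decode_* macros, and the sample buffer layout is only assumed to mirror the hunk above.

/* Userspace sketch of a length-checked decode in the style of the
 * getvxattr reply parser. Host is assumed little-endian for brevity. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct xattr_out {
	const char *value;
	size_t value_len;
};

static int decode_u32(const uint8_t **p, const uint8_t *end, uint32_t *v)
{
	if (end - *p < 4)
		return -1;
	memcpy(v, *p, 4);	/* wire value is little-endian */
	*p += 4;
	return 0;
}

static int parse_getvxattr_reply(const uint8_t *p, const uint8_t *end,
				 struct xattr_out *out)
{
	uint32_t payload_len, value_len;

	if (end - p < 2)		/* struct version + compat version */
		return -1;
	p += 2;
	if (decode_u32(&p, end, &payload_len))
		return -1;
	if (decode_u32(&p, end, &value_len))
		return -1;

	/* Accept the value only if it is exactly the rest of the buffer. */
	if (value_len != (size_t)(end - p))
		return -1;

	out->value = (const char *)p;
	out->value_len = value_len;
	return (int)value_len;
}

int main(void)
{
	/* 1-byte ver, 1-byte compat, 4-byte payload len, 4-byte value len, "hi" */
	const uint8_t msg[] = { 1, 1, 6, 0, 0, 0, 2, 0, 0, 0, 'h', 'i' };
	struct xattr_out out = { "", 0 };
	int ret = parse_getvxattr_reply(msg, msg + sizeof(msg), &out);

	printf("ret=%d value=%.*s\n", ret, (int)out.value_len, out.value);
	return ret < 0;
}
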
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 97c7f7bfa55f..33497846e47e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -274,8 +280,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- const struct cred *r_cred;
int r_request_release_offset;
+ const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -296,12 +302,11 @@ struct ceph_mds_request {
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
-
+ u32 r_readdir_offset;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
- u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
@@ -329,7 +334,6 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
- ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
@@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 0fcba68f9a99..c47347d2e84e 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -8,6 +8,12 @@
#include "metric.h"
#include "mds_client.h"
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
@@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
- struct timespec64 ts;
s64 sum;
s32 items = 0;
s32 len;
@@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
- read->header.ver = 1;
+ read->header.ver = 2;
read->header.compat = 1;
read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
sum = m->metric[METRIC_READ].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- read->sec = cpu_to_le32(ts.tv_sec);
- read->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
items++;
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
- write->header.ver = 1;
+ write->header.ver = 2;
write->header.compat = 1;
write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
sum = m->metric[METRIC_WRITE].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- write->sec = cpu_to_le32(ts.tv_sec);
- write->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
items++;
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
- meta->header.ver = 1;
+ meta->header.ver = 2;
meta->header.compat = 1;
meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
sum = m->metric[METRIC_METADATA].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- meta->sec = cpu_to_le32(ts.tv_sec);
- meta->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
items++;
/* encode the dentry lease metric */
@@ -250,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
metric->size_max = 0;
metric->total = 0;
metric->latency_sum = 0;
+ metric->latency_avg = 0;
metric->latency_sq_sum = 0;
metric->latency_min = KTIME_MAX;
metric->latency_max = 0;
@@ -307,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
max = new; \
}
-static inline void __update_stdev(ktime_t total, ktime_t lsum,
- ktime_t *sq_sump, ktime_t lat)
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
{
- ktime_t avg, sq;
-
- if (unlikely(total == 1))
- return;
-
- /* the sq is (lat - old_avg) * (lat - new_avg) */
- avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1));
- sq = lat - avg;
- avg = DIV64_U64_ROUND_CLOSEST(lsum, total);
- sq = sq * (lat - avg);
- *sq_sump += sq;
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
}
void ceph_update_metrics(struct ceph_metric *m,
@@ -339,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m,
METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
m->latency_sum += lat;
METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
- __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
spin_unlock(&m->lock);
}
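
__update_mean_and_stdev() above is the standard Welford online update: the running mean moves by (lat - old_avg) / total, and the squared-deviation accumulator gains (lat - old_avg) * (lat - new_avg). A self-contained userspace sketch of the same recurrence, using doubles instead of ktime_t.

/* Userspace sketch of the Welford-style running mean/variance update.
 * Link with -lm for sqrt(). */
#include <math.h>
#include <stdio.h>

struct running_stats {
	long n;
	double avg;
	double sq_sum;	/* accumulated (x - old_avg) * (x - new_avg) */
};

static void stats_update(struct running_stats *s, double x)
{
	double old_avg = s->avg;

	s->n++;
	if (s->n == 1) {
		s->avg = x;
		return;
	}
	s->avg = old_avg + (x - old_avg) / s->n;
	s->sq_sum += (x - old_avg) * (x - s->avg);
}

int main(void)
{
	static const double lat_us[] = { 120, 95, 130, 110, 200 };
	struct running_stats s = { 0, 0.0, 0.0 };
	size_t i;

	for (i = 0; i < sizeof(lat_us) / sizeof(lat_us[0]); i++)
		stats_update(&s, lat_us[i]);

	printf("n=%ld avg=%.1f stdev=%.1f\n", s.n, s.avg,
	       s.n > 1 ? sqrt(s.sq_sum / (s.n - 1)) : 0.0);
	return 0;
}
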
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index bb45608181e7..0d0c44bd3332 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -2,7 +2,7 @@
#ifndef _FS_CEPH_MDS_METRIC_H
#define _FS_CEPH_MDS_METRIC_H
-#include <linux/types.h>
+#include <linux/ceph/types.h>
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
@@ -19,27 +19,39 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
-
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
/*
* This will always have the highest metric bit value
* as the last element of the array.
*/
-#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
- CLIENT_METRIC_TYPE_CAP_INFO, \
- CLIENT_METRIC_TYPE_READ_LATENCY, \
- CLIENT_METRIC_TYPE_WRITE_LATENCY, \
- CLIENT_METRIC_TYPE_METADATA_LATENCY, \
- CLIENT_METRIC_TYPE_DENTRY_LEASE, \
- CLIENT_METRIC_TYPE_OPENED_FILES, \
- CLIENT_METRIC_TYPE_PINNED_ICAPS, \
- CLIENT_METRIC_TYPE_OPENED_INODES, \
- CLIENT_METRIC_TYPE_READ_IO_SIZES, \
- CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
- \
- CLIENT_METRIC_TYPE_MAX, \
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
}
struct ceph_metric_header {
@@ -60,22 +72,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric dentry lease header */
@@ -140,6 +158,7 @@ struct ceph_metric {
u64 size_min;
u64 size_max;
ktime_t latency_sum;
+ ktime_t latency_avg;
ktime_t latency_sq_sum;
ktime_t latency_min;
ktime_t latency_max;
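
The expanded latency records now carry the latency sum, running average, squared-deviation sum and sample count as packed little-endian fields. The sketch below shows one way such a record could be serialized in userspace with explicit endian conversion; the struct layout is illustrative only and not the exact ceph wire format, which encodes the times as ceph_timespec.

/* Userspace sketch of emitting a fixed little-endian latency record.
 * Uses the glibc endian.h helpers. */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wire_latency {
	uint32_t type;		/* metric type */
	uint32_t sec;		/* latency sum, seconds */
	uint32_t nsec;		/* latency sum, nanoseconds */
	uint64_t sq_sum;	/* sum of squared deviations */
	uint64_t count;		/* number of samples */
} __attribute__((packed));

static size_t encode_latency(uint8_t *buf, uint32_t type, uint64_t sum_ns,
			     uint64_t sq_sum, uint64_t count)
{
	struct wire_latency w = {
		.type	= htole32(type),
		.sec	= htole32((uint32_t)(sum_ns / 1000000000ULL)),
		.nsec	= htole32((uint32_t)(sum_ns % 1000000000ULL)),
		.sq_sum	= htole64(sq_sum),
		.count	= htole64(count),
	};

	memcpy(buf, &w, sizeof(w));
	return sizeof(w);
}

int main(void)
{
	uint8_t buf[64];
	size_t len = encode_latency(buf, 1, 2500000000ULL, 1234, 42);

	printf("encoded %zu bytes\n", len);
	return 0;
}
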
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b41e6724c591..322ee5add942 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 1); /* for caller */
+ /* Do not release the global dummy snaprealm until unmounting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ dout("%s %llx %p\n", __func__, realm->ino, realm);
return realm;
}
@@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ dout("%s %llx %p\n", __func__, r->ino, r);
return r;
}
}
@@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ dout("%s %p %llx\n", __func__, realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
+ realm, realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm,
- struct list_head* dirty_realms)
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm,
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent, dirty_realms);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
+ dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ __func__, realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int)realm->cached_context->num_snaps);
return 0;
@@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
+ realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -409,8 +417,7 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
return err;
}
@@ -420,13 +427,50 @@ fail:
static void rebuild_snap_realms(struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last build attempt failed due to a memory
+ * allocation failure, just empty the realm_queue and
+ * return to avoid an infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
+
+ last = build_snap_context(_realm, &realm_queue, dirty_realms);
+ dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+
+ /* is any child already in the queue? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm, dirty_realms);
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child, dirty_realms);
+ /* last == 1 means we need to build the parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
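
rebuild_snap_realms() now avoids recursion by driving an explicit work queue: a realm whose parent context is not yet built is deferred by pushing the parent onto the head of the queue, otherwise the realm is built and its children are appended. A minimal userspace sketch of that defer-to-parent iteration over a generic tree; the error-drain path is omitted and the node type is not ceph_snap_realm.

/* Userspace sketch of queue-based, non-recursive top-down rebuild. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_CHILDREN	4
#define QCAP		64

struct node {
	int id;
	struct node *parent;
	struct node *child[MAX_CHILDREN];
	int nr_child;
	bool built;
};

/* Pending-work deque; parents can be pushed at the front. */
static struct node *dq[QCAP];
static int head = QCAP / 2, tail = QCAP / 2;

static void push_front(struct node *n) { dq[--head] = n; }
static void push_back(struct node *n)  { dq[tail++] = n; }

static void rebuild(struct node *start)
{
	push_back(start);

	while (head < tail) {
		struct node *n = dq[head];
		int i;

		/* Already handled through an earlier queue entry. */
		if (n->built) {
			head++;
			continue;
		}

		/* Parent context missing: defer this node, do the parent first. */
		if (n->parent && !n->parent->built) {
			push_front(n->parent);
			continue;
		}

		n->built = true;
		printf("built realm %d\n", n->id);

		/* This node is done; its children are processed next. */
		head++;
		for (i = 0; i < n->nr_child; i++)
			push_back(n->child[i]);
	}
}

int main(void)
{
	struct node n[4] = { { .id = 0 }, { .id = 1 }, { .id = 2 }, { .id = 3 } };

	/* 0 is the root; 1 and 2 are its children; 3 is a child of 1. */
	n[1].parent = &n[0];
	n[2].parent = &n[0];
	n[3].parent = &n[1];
	n[0].child[n[0].nr_child++] = &n[1];
	n[0].child[n[0].nr_child++] = &n[2];
	n[1].child[n[1].nr_child++] = &n[3];

	/* Start mid-tree, as update_snap_trace() may; the parent is built first. */
	rebuild(&n[1]);
	return 0;
}
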
@@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
- capsnap->cap_flush.is_capsnap = true;
- INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
- INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
+ dout("%s %p %llx.%llx already pending\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ dout("%s %p %llx.%llx nothing dirty|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
@@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("queue_cap_snap %p "
- "no new_snap|dirty_page|writing\n", inode);
+ dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
- inode, capsnap, old_snapc, ceph_cap_string(dirty),
- capsnap->need_flush ? "" : "no_flush");
+ dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- refcount_set(&capsnap->nref, 1);
- INIT_LIST_HEAD(&capsnap->ci_item);
-
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
@@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", __func__, inode, ceph_vinop(inode),
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
- capsnap = NULL;
+ *pcapsnap = NULL;
old_snapc = NULL;
update_snapc:
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ci->i_head_snapc = NULL;
- } else {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob);
- kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
@@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
/* Fb cap still in use, delay it */
if (ci->i_wb_ref) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
capsnap->writing = 1;
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
@@ -671,8 +706,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ dout("%s %p %llx inode\n", __func__, realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -682,13 +718,35 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to avoid the frequent but mostly unnecessary memory
+ * allocation/freeing this loop would otherwise do.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ dout("%s %p %llx done\n", __func__, realm, realm->ino);
}
/*
@@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm = NULL;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("update_snap_trace deletion=%d\n", deletion);
+ dout("%s deletion=%d\n", __func__, deletion);
more:
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -738,10 +798,10 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ dout("%s updating %llx %p %lld -> %lld\n", __func__,
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
@@ -763,22 +823,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
+ dout("%s %llx %p seq %lld new\n", __func__,
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ dout("%s %llx %p seq %lld unchanged\n", __func__,
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppermost parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in the
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -814,7 +882,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("update_snap_trace error %d\n", err);
+ pr_err("%s error %d\n", __func__, err);
return err;
}
@@ -831,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ dout("%s\n", __func__);
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("flush_snaps done\n");
+ dout("%s done\n", __func__);
}
/**
@@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
+ dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
+ mds, ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
inc_session_sequence(session);
@@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
+ dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
+ dout(" will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1038,7 +1106,7 @@ skip_inode:
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
@@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
- dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ dout("%s create snapid map %llx -> %x\n", __func__,
+ sm->snap, sm->dev);
return sm;
}
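The comment in queue_realm_cap_snaps() above describes an allocate-once, reuse-until-consumed pattern: one object is allocated before the loop, handed to the callee, and only replaced when the callee actually kept it. A minimal userspace C sketch of that pattern (illustrative only; the names are hypothetical, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

struct work_item {
        int payload;
};

/* Callee: keeps the item (and sets *pitem to NULL) only when it needs it. */
static void maybe_consume(int value, struct work_item **pitem)
{
        if (value % 2 == 0)
                return;                 /* nothing queued, caller reuses the item */
        (*pitem)->payload = value;
        printf("queued item with payload %d\n", (*pitem)->payload);
        *pitem = NULL;                  /* ownership transferred to the queue */
}

int main(void)
{
        struct work_item *item = NULL;
        int i;

        for (i = 0; i < 8; i++) {
                /* Only allocate a replacement if the previous item was consumed. */
                if (!item) {
                        item = calloc(1, sizeof(*item));
                        if (!item)
                                return 1;
                }
                maybe_consume(i, &item);
        }
        free(item);                     /* free the spare, if one survived the loop */
        return 0;
}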
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 573bb9556fb5..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf79f369aec6..e6987d295079 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -802,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
+ fsc->write_congested = false;
err = -ENOMEM;
/*
@@ -864,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
@@ -892,6 +894,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
@@ -931,6 +936,8 @@ bad_file:
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -947,6 +954,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 67f145e1ae7a..20ceab74e871 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -17,13 +17,11 @@
#include <linux/posix_acl.h>
#include <linux/refcount.h>
#include <linux/security.h>
+#include <linux/netfs.h>
+#include <linux/fscache.h>
#include <linux/ceph/libceph.h>
-#ifdef CONFIG_CEPH_FSCACHE
-#include <linux/fscache.h>
-#endif
-
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
@@ -121,6 +119,7 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
atomic_long_t writeback_count;
+ bool write_congested;
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
@@ -230,7 +229,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -317,6 +316,11 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode;
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -427,11 +431,6 @@ struct ceph_inode_info {
struct work_struct i_work;
unsigned long i_work_mask;
-
-#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
-#endif
- struct inode vfs_inode; /* at end */
};
static inline struct ceph_inode_info *
@@ -883,6 +882,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -938,7 +939,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
@@ -1048,6 +1049,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
@@ -1212,8 +1214,9 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
+extern const struct netfs_request_ops ceph_netfs_ops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index fcf7dfdecf96..afec84088471 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
+ struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
+
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
@@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
+ return err;
}
-
+handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
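The __ceph_getxattr() hunk above dispatches "ceph."-prefixed names to a locally computed vxattr first and otherwise to the new server-side getvxattr request, mapping EOPNOTSUPP to ENODATA when talking to an old server. A minimal userspace sketch of that dispatch order, with hypothetical helpers standing in for the real kernel functions:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define VXATTR_PREFIX     "ceph."
#define VXATTR_PREFIX_LEN (sizeof(VXATTR_PREFIX) - 1)

/* Hypothetical stand-in: returns -1 when the name is not handled locally. */
static ssize_t local_vxattr(const char *name, char *buf, size_t size)
{
        if (strcmp(name, "ceph.dir.entries") != 0)
                return -1;
        return snprintf(buf, size, "42");
}

/* Hypothetical stand-in: pretend the server is too old for getvxattr. */
static ssize_t server_vxattr(const char *name, char *buf, size_t size)
{
        (void)name; (void)buf; (void)size;
        return -EOPNOTSUPP;
}

static ssize_t getxattr_sketch(const char *name, char *buf, size_t size)
{
        ssize_t ret;

        if (strncmp(name, VXATTR_PREFIX, VXATTR_PREFIX_LEN))
                return -ENODATA;        /* ordinary xattr path, elided here */

        ret = local_vxattr(name, buf, size);
        if (ret >= 0)
                return ret;

        ret = server_vxattr(name, buf, size);
        if (ret == -EOPNOTSUPP)         /* new client, old server combo */
                ret = -ENODATA;
        return ret;
}

int main(void)
{
        char buf[16];

        printf("%zd\n", getxattr_sketch("ceph.dir.entries", buf, sizeof(buf)));
        printf("%zd\n", getxattr_sketch("ceph.cluster_fsid", buf, sizeof(buf)));
        return 0;
}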
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ea00e1a91250..9d334816eac0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -94,7 +94,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
- tcon->tidStatus);
+ tcon->status);
if (dev_type == FILE_DEVICE_DISK)
seq_puts(m, " type: DISK ");
else if (dev_type == FILE_DEVICE_CD_ROM)
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index cdce1609c5c2..180c234c2f46 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch
switch (state) {
case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name);
- cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_AVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name);
- cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_UNKNOWN:
cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name);
@@ -498,7 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
goto unlock;
}
- cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, false);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
mutex_unlock(&tcon->ses->server->srv_mutex);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 082c21478686..2b1a1c029c75 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -210,6 +210,9 @@ cifs_read_super(struct super_block *sb)
if (rc)
goto out_no_root;
/* tune readahead according to rsize if readahead size not set on mount */
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
if (cifs_sb->ctx->rasize)
sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
else
@@ -254,26 +257,33 @@ static void cifs_kill_sb(struct super_block *sb)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct cached_fid *cfid;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct tcon_link *tlink;
/*
* We need to release all dentries for the cached directories
* before we kill the sb.
*/
if (cifs_sb->root) {
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
+ if (IS_ERR(tcon))
+ continue;
+ cfid = &tcon->crfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ }
+
+ /* finally release root dentry */
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon) {
- cfid = &tcon->crfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
-
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- mutex_unlock(&cfid->fid_mutex);
- }
kill_anon_super(sb);
cifs_umount(cifs_sb);
@@ -354,7 +364,7 @@ static struct inode *
cifs_alloc_inode(struct super_block *sb)
{
struct cifsInodeInfo *cifs_inode;
- cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
+ cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL);
if (!cifs_inode)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */
@@ -691,14 +701,14 @@ static void cifs_umount_begin(struct super_block *sb)
tcon = cifs_sb_master_tcon(cifs_sb);
spin_lock(&cifs_tcp_ses_lock);
- if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
+ if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to force umount this and woken up
all waiting network requests, nothing to do */
spin_unlock(&cifs_tcp_ses_lock);
return;
} else if (tcon->tc_count == 1)
- tcon->tidStatus = CifsExiting;
+ tcon->status = TID_EXITING;
spin_unlock(&cifs_tcp_ses_lock);
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
@@ -936,7 +946,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 15a5c5db038b..c0542bdcd06b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -153,5 +153,5 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
#define SMB3_PRODUCT_BUILD 35
-#define CIFS_VERSION "2.35"
+#define CIFS_VERSION "2.36"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 48b343d03430..8de977c359b1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,6 +16,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
+#include <linux/netfs.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
@@ -115,10 +116,18 @@ enum statusEnum {
CifsInNegotiate,
CifsNeedSessSetup,
CifsInSessSetup,
- CifsNeedTcon,
- CifsInTcon,
- CifsNeedFilesInvalidate,
- CifsInFilesInvalidate
+};
+
+/* associated with each tree connection to the server */
+enum tid_status_enum {
+ TID_NEW = 0,
+ TID_GOOD,
+ TID_EXITING,
+ TID_NEED_RECON,
+ TID_NEED_TCON,
+ TID_IN_TCON,
+ TID_NEED_FILES_INVALIDATE, /* currently unused */
+ TID_IN_FILES_INVALIDATE
};
enum securityEnum {
@@ -852,13 +861,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
-/*
- * The default wsize is 1M. find_get_pages seems to return a maximum of 256
- * pages in a single call. With PAGE_SIZE == 4k, this means we can fill
- * a single wsize request with a single call.
- */
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
-#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
/*
* Windows only supports a max of 60kb reads and 65535 byte writes. Default to
@@ -1038,7 +1041,7 @@ struct cifs_tcon {
char *password; /* for share-level security */
__u32 tid; /* The 4 byte tree id */
__u16 Flags; /* optional support bits */
- enum statusEnum tidStatus;
+ enum tid_status_enum status;
atomic_t num_smbs_sent;
union {
struct {
@@ -1402,6 +1405,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
*/
struct cifsInodeInfo {
+ struct {
+ /* These must be contiguous */
+ struct inode vfs_inode; /* the VFS's inode record */
+ struct netfs_i_context netfs_ctx; /* Netfslib context */
+ };
bool can_cache_brlcks;
struct list_head llist; /* locks held by this inode */
/*
@@ -1432,10 +1440,6 @@ struct cifsInodeInfo {
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
-#ifdef CONFIG_CIFS_FSCACHE
- struct fscache_cookie *fscache;
-#endif
- struct inode vfs_inode;
struct list_head deferred_closes; /* list of deferred closes */
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
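The cifsInodeInfo change above (and the matching ceph_inode_info change) places the netfs context immediately after the embedded VFS inode, which is what lets netfslib locate its per-inode context from a bare inode pointer. A minimal userspace sketch of that layout trick, with hypothetical types standing in for the real ones and assuming no padding between the two members:

#include <stdio.h>

struct fake_inode {
        unsigned long ino;
};

struct fake_netfs_ctx {
        void *cache_cookie;
};

struct fs_inode {
        struct {
                /* These must be contiguous, in this order. */
                struct fake_inode vfs_inode;
                struct fake_netfs_ctx netfs_ctx;
        };
        int fs_private_state;
};

/* What a netfs-like library does when it only sees the embedded inode. */
static struct fake_netfs_ctx *netfs_ctx_of(struct fake_inode *inode)
{
        return (struct fake_netfs_ctx *)(inode + 1);
}

int main(void)
{
        struct fs_inode fi = { .fs_private_state = 7 };

        fi.vfs_inode.ino = 100;
        fi.netfs_ctx.cache_cookie = &fi;

        printf("ctx at %p, found via helper at %p\n",
               (void *)&fi.netfs_ctx, (void *)netfs_ctx_of(&fi.vfs_inode));
        return 0;
}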
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 68b9a436af4b..aeba371c4c70 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -123,18 +123,6 @@
*/
#define CIFS_SESS_KEY_SIZE (16)
-/*
- * Size of the smb3 signing key
- */
-#define SMB3_SIGN_KEY_SIZE (16)
-
-/*
- * Size of the smb3 encryption/decryption key storage.
- * This size is big enough to store any cipher key types.
- */
-#define SMB3_ENC_DEC_KEY_SIZE (32)
-
-#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
#define CIFS_CPHTXT_SIZE (16)
@@ -1658,7 +1646,7 @@ struct smb_t2_rsp {
#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105
#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106
#define SMB_FIND_FILE_UNIX 0x202
-#define SMB_FIND_FILE_POSIX_INFO 0x064
+/* #define SMB_FIND_FILE_POSIX_INFO 0x064 */
typedef struct smb_com_transaction2_qpi_req {
struct smb_hdr hdr; /* wct = 14+ */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d3701295402d..0df3b24a0bf4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -132,6 +132,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels);
+void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session);
extern int cifs_reconnect(struct TCP_Server_Info *server,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 071e2f21a7db..47e927c4ff8d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -75,12 +75,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->ses->status != CifsGood ||
- tcon->tidStatus != CifsNeedReconnect) {
+ if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
- tcon->tidStatus = CifsInFilesInvalidate;
+ tcon->status = TID_IN_FILES_INVALIDATE;
spin_unlock(&cifs_tcp_ses_lock);
/* list all files open on tree connection and mark them invalid */
@@ -100,8 +99,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
mutex_unlock(&tcon->crfid.fid_mutex);
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsInFilesInvalidate)
- tcon->tidStatus = CifsNeedTcon;
+ if (tcon->status == TID_IN_FILES_INVALIDATE)
+ tcon->status = TID_NEED_TCON;
spin_unlock(&cifs_tcp_ses_lock);
/*
@@ -136,7 +135,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* have tcon) are allowed as we start force umount
*/
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsExiting) {
+ if (tcon->status == TID_EXITING) {
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
@@ -597,7 +596,7 @@ CIFSSMBNegotiate(const unsigned int xid,
set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
- /* set up max_read for readpages check */
+ /* set up max_read for readahead check */
server->max_read = server->maxBuf;
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 053cb449eb16..42e14f408856 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -163,10 +163,50 @@ static void cifs_resolve_server(struct work_struct *work)
}
/*
+ * Update the tcpStatus for the server.
+ * This is used to signal the cifsd thread to call cifs_reconnect.
+ * Only the cifsd thread should call cifs_reconnect; any other
+ * thread must use this function instead.
+ *
+ * @server: the tcp ses for which reconnect is needed
+ * @all_channels: if this needs to be done for all channels
+ */
+void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels)
+{
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ int i;
+
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!all_channels) {
+ pserver->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->chan_lock);
+ for (i = 0; i < ses->chan_count; i++)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chan_lock);
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+/*
* Mark all sessions and tcons for reconnect.
+ * IMPORTANT: make sure that this gets called only from
+ * the cifsd thread. Any other thread must use
+ * cifs_signal_cifsd_for_reconnect() instead.
*
+ * @server: the tcp ses for which reconnect is needed
* @server needs to be previously set to CifsNeedReconnect.
- *
+ * @mark_smb_session: whether the smb sessions (not just the tcons) also need to be marked
*/
void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
@@ -205,7 +245,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
tcon->need_reconnect = true;
- tcon->tidStatus = CifsNeedReconnect;
+ tcon->status = TID_NEED_RECON;
}
if (ses->tcon_ipc)
ses->tcon_ipc->need_reconnect = true;
@@ -413,9 +453,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_
return rc;
}
-static int
-reconnect_dfs_server(struct TCP_Server_Info *server,
- bool mark_smb_session)
+static int reconnect_dfs_server(struct TCP_Server_Info *server)
{
int rc = 0;
const char *refpath = server->current_fullpath + 1;
@@ -439,7 +477,12 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
if (!cifs_tcp_ses_needs_reconnect(server, num_targets))
return 0;
- cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
+ /*
+ * Unconditionally mark all sessions & tcons for reconnect as we might be connecting to a
+ * different server or share during failover. This could be improved by only
+ * doing so when the connection actually moves to a different server or share.
+ */
+ cifs_mark_tcp_ses_conns_for_reconnect(server, true);
cifs_abort_connection(server);
@@ -491,13 +534,20 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
/* If tcp session is not an dfs connection, then reconnect to last target server */
spin_lock(&cifs_tcp_ses_lock);
- if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) {
+ if (!server->is_dfs_conn) {
spin_unlock(&cifs_tcp_ses_lock);
return __cifs_reconnect(server, mark_smb_session);
}
spin_unlock(&cifs_tcp_ses_lock);
- return reconnect_dfs_server(server, mark_smb_session);
+ mutex_lock(&server->refpath_lock);
+ if (!server->origin_fullpath || !server->leaf_fullpath) {
+ mutex_unlock(&server->refpath_lock);
+ return __cifs_reconnect(server, mark_smb_session);
+ }
+ mutex_unlock(&server->refpath_lock);
+
+ return reconnect_dfs_server(server);
}
#else
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
@@ -1006,7 +1056,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_hdr_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_server_dbg(FYI, "%s: added %u credits total=%d\n",
@@ -2167,7 +2217,7 @@ get_ses_fail:
static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
- if (tcon->tidStatus == CifsExiting)
+ if (tcon->status == TID_EXITING)
return 0;
if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE))
return 0;
@@ -3473,6 +3523,9 @@ static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path,
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
char *oldmnt = cifs_sb->ctx->mount_options;
+ cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path,
+ dfs_cache_get_tgt_name(tit));
+
rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref);
if (rc)
goto out;
@@ -3571,13 +3624,18 @@ static int __follow_dfs_link(struct mount_ctx *mnt_ctx)
if (rc)
goto out;
- /* Try all dfs link targets */
+ /* Try all dfs link targets. If an I/O fails from the currently connected DFS target with an
+ * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as
+ * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than
+ * STATUS_PATH_NOT_COVERED."
+ */
for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl);
tit; tit = dfs_cache_get_next_tgt(&tl, tit)) {
rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit);
if (!rc) {
rc = is_path_remote(mnt_ctx);
- break;
+ if (!rc || rc == -EREMOTE)
+ break;
}
}
@@ -3624,9 +3682,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
{
struct TCP_Server_Info *server = mnt_ctx->server;
+ mutex_lock(&server->refpath_lock);
server->origin_fullpath = mnt_ctx->origin_fullpath;
server->leaf_fullpath = mnt_ctx->leaf_fullpath;
server->current_fullpath = mnt_ctx->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
}
@@ -3651,7 +3711,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
rc = is_path_remote(&mnt_ctx);
- if (rc == -EREMOTE)
+ if (rc)
rc = follow_dfs_link(&mnt_ctx);
if (rc)
goto error;
@@ -3924,7 +3984,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus != CifsNeedSessSetup) {
+ if ((server->tcpStatus != CifsNeedSessSetup) &&
+ (ses->status == CifsGood)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
@@ -4416,7 +4477,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco
*/
if (rc && server->current_fullpath != server->origin_fullpath) {
server->current_fullpath = server->origin_fullpath;
- cifs_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(server, true);
}
dfs_cache_free_tgts(tl);
@@ -4437,12 +4498,12 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
if (tcon->ses->status != CifsGood ||
- (tcon->tidStatus != CifsNew &&
- tcon->tidStatus != CifsNeedTcon)) {
+ (tcon->status != TID_NEW &&
+ tcon->status != TID_NEED_TCON)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
- tcon->tidStatus = CifsInTcon;
+ tcon->status = TID_IN_TCON;
spin_unlock(&cifs_tcp_ses_lock);
tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
@@ -4483,13 +4544,13 @@ out:
if (rc) {
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsInTcon)
- tcon->tidStatus = CifsNeedTcon;
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_NEED_TCON;
spin_unlock(&cifs_tcp_ses_lock);
} else {
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsInTcon)
- tcon->tidStatus = CifsGood;
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_GOOD;
spin_unlock(&cifs_tcp_ses_lock);
tcon->need_reconnect = false;
}
@@ -4505,24 +4566,24 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
if (tcon->ses->status != CifsGood ||
- (tcon->tidStatus != CifsNew &&
- tcon->tidStatus != CifsNeedTcon)) {
+ (tcon->status != TID_NEW &&
+ tcon->status != TID_NEED_TCON)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
- tcon->tidStatus = CifsInTcon;
+ tcon->status = TID_IN_TCON;
spin_unlock(&cifs_tcp_ses_lock);
rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
if (rc) {
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsInTcon)
- tcon->tidStatus = CifsNeedTcon;
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_NEED_TCON;
spin_unlock(&cifs_tcp_ses_lock);
} else {
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsInTcon)
- tcon->tidStatus = CifsGood;
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_GOOD;
spin_unlock(&cifs_tcp_ses_lock);
tcon->need_reconnect = false;
}
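cifs_signal_cifsd_for_reconnect() above only flips tcpStatus so that the cifsd thread performs the actual reconnect. A minimal userspace sketch of that "signal the owning thread instead of doing the work inline" pattern, using pthreads and hypothetical names rather than the kernel primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool need_reconnect;
static bool shutting_down;

/* Any thread may call this; it never reconnects by itself. */
static void signal_for_reconnect(void)
{
        pthread_mutex_lock(&lock);
        need_reconnect = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

/* Only this thread is allowed to perform the reconnect work. */
static void *owner_thread(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!shutting_down) {
                while (!need_reconnect && !shutting_down)
                        pthread_cond_wait(&cond, &lock);
                if (need_reconnect) {
                        need_reconnect = false;
                        pthread_mutex_unlock(&lock);
                        puts("owner: reconnecting...");
                        pthread_mutex_lock(&lock);
                }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, owner_thread, NULL);
        signal_for_reconnect();         /* ask the owner to reconnect */
        sleep(1);

        pthread_mutex_lock(&lock);
        shutting_down = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
}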
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 831f42458bf6..956f8e5cf3e7 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
@@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool
struct TCP_Server_Info *server = tcon->ses->server;
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
-
return 0;
}
@@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions)
list_del_init(&tcon->ulist);
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e7af802dcfa6..d511a78383c3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3740,6 +3740,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
@@ -4205,13 +4210,19 @@ cifs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ /* Wait for the page to be written to the cache before we allow it to
+ * be modified. We then assume the entire page will need writing back.
+ */
#ifdef CONFIG_CIFS_FSCACHE
if (PageFsCache(page) &&
wait_on_page_fscache_killable(page) < 0)
return VM_FAULT_RETRY;
#endif
- lock_page(page);
+ wait_on_page_writeback(page);
+
+ if (lock_page_killable(page) < 0)
+ return VM_FAULT_RETRY;
return VM_FAULT_LOCKED;
}
@@ -4474,6 +4485,11 @@ static void cifs_readahead(struct readahead_control *ractl)
}
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
@@ -4754,17 +4770,17 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return true;
}
-static void cifs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void cifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
-static int cifs_launder_page(struct page *page)
+static int cifs_launder_folio(struct folio *folio)
{
int rc = 0;
- loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
+ loff_t range_start = folio_pos(folio);
+ loff_t range_end = range_start + folio_size(folio);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -4772,12 +4788,12 @@ static int cifs_launder_page(struct page *page)
.range_end = range_end,
};
- cifs_dbg(FYI, "Launder page: %p\n", page);
+ cifs_dbg(FYI, "Launder page: %lu\n", folio->index);
- if (clear_page_dirty_for_io(page))
- rc = cifs_writepage_locked(page, &wbc);
+ if (folio_clear_dirty_for_io(folio))
+ rc = cifs_writepage_locked(&folio->page, &wbc);
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
return rc;
}
@@ -4939,12 +4955,13 @@ static void cifs_swap_deactivate(struct file *file)
* need to pin the cache object to write back to.
*/
#ifdef CONFIG_CIFS_FSCACHE
-static int cifs_set_page_dirty(struct page *page)
+static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host));
+ return fscache_dirty_folio(mapping, folio,
+ cifs_inode_cookie(mapping->host));
}
#else
-#define cifs_set_page_dirty __set_page_dirty_nobuffers
+#define cifs_dirty_folio filemap_dirty_folio
#endif
const struct address_space_operations cifs_addr_ops = {
@@ -4954,11 +4971,11 @@ const struct address_space_operations cifs_addr_ops = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = cifs_set_page_dirty,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
.direct_IO = cifs_direct_io,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
/*
* TODO: investigate and, if useful, we could add a cifs_migratePage
* helper (under CONFIG_MIGRATION) in the future, and also
@@ -4979,8 +4996,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = cifs_set_page_dirty,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
};
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 33af72e0ac0c..a638b29e9062 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -103,7 +103,7 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
- cifsi->fscache =
+ cifsi->netfs_ctx.cache =
fscache_acquire_cookie(tcon->fscache, 0,
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
@@ -126,22 +126,15 @@ void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
void cifs_fscache_release_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
- if (cifsi->fscache) {
- cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- fscache_relinquish_cookie(cifsi->fscache, false);
- cifsi->fscache = NULL;
+ if (cookie) {
+ cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);
+ fscache_relinquish_cookie(cookie, false);
+ cifsi->netfs_ctx.cache = NULL;
}
}
-static inline void fscache_end_operation(struct netfs_cache_resources *cres)
-{
- const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
-
- if (ops)
- ops->end_operation(cres);
-}
-
/*
* Fallback page reading interface.
*/
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 55129908e2c1..52355c0912ae 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -61,7 +61,7 @@ void cifs_fscache_fill_coherency(struct inode *inode,
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
- return CIFS_I(inode)->fscache;
+ return netfs_i_cookie(inode);
}
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 60d853c92f6a..2f9e7d2f81b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode)
inode->i_fop = &cifs_file_ops;
}
- /* check if server can support readpages */
+ /* check if server can support readahead */
if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read <
PAGE_SIZE + MAX_CIFS_HDR_SIZE)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 852e54ee82c2..bbdf3281559c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
if (rc != 1)
return -EINVAL;
+ if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+ return -EINVAL;
+
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 56598f7dbe00..afaf59c22193 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -116,7 +116,7 @@ tconInfoAlloc(void)
}
atomic_inc(&tconInfoAllocCount);
- ret_buf->tidStatus = CifsNew;
+ ret_buf->status = TID_NEW;
++ret_buf->tc_count;
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ebe236b9d9f5..235aa1b395eb 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -896,7 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr)
if (class == ERRSRV && code == ERRbaduid) {
cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n",
code);
- cifs_reconnect(mid->server, false);
+ cifs_signal_cifsd_for_reconnect(mid->server, false);
}
}
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 298458404252..55758b9ec877 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -107,7 +107,7 @@ struct negotiate_message {
SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
struct ntlmssp_version Version;
/* SECURITY_BUFFER */
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __packed;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index b2fb7bd11936..c71c9a44bef4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -228,7 +228,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
if (reconnect) {
- cifs_mark_tcp_ses_conns_for_reconnect(server, false);
+ cifs_signal_cifsd_for_reconnect(server, false);
}
return mid;
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 4125fd113cfb..82e916ad167c 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -41,15 +41,4 @@
#define END_OF_CHAIN 4
#define RELATED_REQUEST 8
-#define SMB2_SIGNATURE_SIZE (16)
-#define SMB2_NTLMV2_SESSKEY_SIZE (16)
-#define SMB2_HMACSHA256_SIZE (32)
-#define SMB2_CMACAES_SIZE (16)
-#define SMB3_SIGNKEY_SIZE (16)
-#define SMB3_GCM128_CRYPTKEY_SIZE (16)
-#define SMB3_GCM256_CRYPTKEY_SIZE (32)
-
-/* Maximum buffer size value we can send with 1 credit */
-#define SMB2_MAX_BUFFER_SIZE 65536
-
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b25623e3fe3d..3fe47a88f47d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -150,16 +150,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
struct cifs_ses *ses = NULL;
+ struct cifs_ses *iter;
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
- if (ses->Suid == le64_to_cpu(thdr->SessionId))
+ list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) {
+ if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
+ ses = iter;
break;
+ }
}
spin_unlock(&cifs_tcp_ses_lock);
- if (list_entry_is_head(ses, &srvr->smb_ses_list,
- smb_ses_list)) {
+ if (!ses) {
cifs_dbg(VFS, "no decryption - session id not found\n");
return 1;
}
@@ -203,7 +205,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
- pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
+ pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
/* error packets have 9 byte structure size */
cifs_dbg(VFS, "Invalid response size %u for command %d\n",
le16_to_cpu(pdu->StructureSize2), command);
@@ -303,7 +305,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
/* error responses do not have data area */
if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
(((struct smb2_err_rsp *)shdr)->StructureSize) ==
- SMB2_ERROR_STRUCTURE_SIZE2)
+ SMB2_ERROR_STRUCTURE_SIZE2_LE)
return NULL;
/*
@@ -478,11 +480,11 @@ smb2_get_lease_state(struct cifsInodeInfo *cinode)
__le32 lease = 0;
if (CIFS_CACHE_WRITE(cinode))
- lease |= SMB2_LEASE_WRITE_CACHING;
+ lease |= SMB2_LEASE_WRITE_CACHING_LE;
if (CIFS_CACHE_HANDLE(cinode))
- lease |= SMB2_LEASE_HANDLE_CACHING;
+ lease |= SMB2_LEASE_HANDLE_CACHING_LE;
if (CIFS_CACHE_READ(cinode))
- lease |= SMB2_LEASE_READ_CACHING;
+ lease |= SMB2_LEASE_READ_CACHING_LE;
return lease;
}
@@ -832,8 +834,8 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve
rc = __smb2_handle_cancelled_cmd(tcon,
le16_to_cpu(hdr->Command),
le64_to_cpu(hdr->MessageId),
- le64_to_cpu(rsp->PersistentFileId),
- le64_to_cpu(rsp->VolatileFileId));
+ rsp->PersistentFileId,
+ rsp->VolatileFileId);
if (rc)
cifs_put_tcon(tcon);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index af5d0830bc8a..d6aaeff4a30a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -25,6 +25,7 @@
#include "smb2glob.h"
#include "cifs_ioctl.h"
#include "smbdirect.h"
+#include "fscache.h"
#include "fs_context.h"
/* Change credits for different ops and return the total number of credits */
@@ -85,6 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server,
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
pr_warn_once("server overflowed SMB3 credits\n");
+ trace_smb3_overflow_credits(server->CurrentMid,
+ server->conn_id, server->hostname, *val,
+ add, server->in_flight);
}
server->in_flight--;
if (server->in_flight == 0 &&
@@ -250,7 +254,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_wait_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -(credits->value), in_flight);
cifs_dbg(FYI, "%s: removed %u credits total=%d\n",
__func__, credits->value, scredits);
@@ -299,7 +303,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_adj_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n",
@@ -896,8 +900,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
atomic_inc(&tcon->num_remote_opens);
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId);
- oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId);
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
#ifdef CONFIG_CIFS_DEBUG2
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
@@ -1191,17 +1195,12 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb)
{
int rc;
- __le16 *utf16_path;
struct kvec rsp_iov = {NULL, 0};
int buftype = CIFS_NO_BUFFER;
struct smb2_query_info_rsp *rsp;
struct smb2_file_full_ea_info *info = NULL;
- utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, path,
FILE_READ_EA,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE,
@@ -1234,7 +1233,6 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
le32_to_cpu(rsp->OutputBufferLength), ea_name);
qeas_exit:
- kfree(utf16_path);
free_rsp_buf(buftype, rsp_iov.iov_base);
return rc;
}
@@ -1294,7 +1292,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
* the new EA. If not we should not add it since we
* would not be able to even read the EAs back.
*/
- rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, path,
FILE_READ_EA,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE,
@@ -1642,6 +1640,7 @@ smb2_ioctl_query_info(const unsigned int xid,
unsigned int size[2];
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
+ void (*free_req1_func)(struct smb_rqst *r);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1651,27 +1650,29 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
- goto e_fault;
-
+ if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) {
+ rc = -EFAULT;
+ goto free_vars;
+ }
if (qi.output_buffer_length > 1024) {
- kfree(vars);
- return -EINVAL;
+ rc = -EINVAL;
+ goto free_vars;
}
if (!ses || !server) {
- kfree(vars);
- return -EIO;
+ rc = -EIO;
+ goto free_vars;
}
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- buffer = memdup_user(arg + sizeof(struct smb_query_info),
- qi.output_buffer_length);
- if (IS_ERR(buffer)) {
- kfree(vars);
- return PTR_ERR(buffer);
+ if (qi.output_buffer_length) {
+ buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length);
+ if (IS_ERR(buffer)) {
+ rc = PTR_ERR(buffer);
+ goto free_vars;
+ }
}
/* Open */
@@ -1709,45 +1710,45 @@ smb2_ioctl_query_info(const unsigned int xid,
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, path);
if (rc)
- goto iqinf_exit;
+ goto free_output_buffer;
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
if (qi.flags & PASSTHRU_FSCTL) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->io_iov[0];
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- qi.info_type, true, buffer,
- qi.output_buffer_length,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ goto free_open_req;
}
+ rqst[1].rq_iov = &vars->io_iov[0];
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, true, buffer, qi.output_buffer_length,
+ CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ free_req1_func = SMB2_ioctl_free;
} else if (qi.flags == PASSTHRU_SET_INFO) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->si_iov[0];
- rqst[1].rq_nvec = 1;
-
- size[0] = 8;
- data[0] = buffer;
-
- rc = SMB2_set_info_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
+ goto free_open_req;
+ }
+ if (qi.output_buffer_length < 8) {
+ rc = -EINVAL;
+ goto free_open_req;
}
+ rqst[1].rq_iov = &vars->si_iov[0];
+ rqst[1].rq_nvec = 1;
+
+ /* MS-FSCC 2.4.13 FileEndOfFileInformation */
+ size[0] = 8;
+ data[0] = buffer;
+
+ rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ current->tgid, FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ free_req1_func = SMB2_set_info_free;
} else if (qi.flags == PASSTHRU_QUERY_INFO) {
rqst[1].rq_iov = &vars->qi_iov[0];
rqst[1].rq_nvec = 1;
@@ -1758,6 +1759,7 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ free_req1_func = SMB2_query_info_free;
} else { /* unknown flags */
cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n",
qi.flags);
@@ -1765,7 +1767,7 @@ smb2_ioctl_query_info(const unsigned int xid,
}
if (rc)
- goto iqinf_exit;
+ goto free_open_req;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1776,14 +1778,14 @@ smb2_ioctl_query_info(const unsigned int xid,
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
if (rc)
- goto iqinf_exit;
+ goto free_req_1;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
if (rc)
- goto iqinf_exit;
+ goto out;
/* No need to bump num_remote_opens since handle immediately closed */
if (qi.flags & PASSTHRU_FSCTL) {
@@ -1793,18 +1795,22 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
if (qi.input_buffer_length > 0 &&
le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length
- > rsp_iov[1].iov_len)
- goto e_fault;
+ > rsp_iov[1].iov_len) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
(const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
} else {
pqi = (struct smb_query_info __user *)arg;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
@@ -1812,28 +1818,30 @@ smb2_ioctl_query_info(const unsigned int xid,
qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(pqi + 1, qi_rsp->Buffer,
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
}
- iqinf_exit:
- cifs_small_buf_release(rqst[0].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[1].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[2].rq_iov[0].iov_base);
+out:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- kfree(vars);
+ SMB2_close_free(&rqst[2]);
+free_req_1:
+ free_req1_func(&rqst[1]);
+free_open_req:
+ SMB2_open_free(&rqst[0]);
+free_output_buffer:
kfree(buffer);
+free_vars:
+ kfree(vars);
return rc;
-
-e_fault:
- rc = -EFAULT;
- goto iqinf_exit;
}
static ssize_t
@@ -1850,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid,
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
+ struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+ /*
+ * We need to flush all unwritten data before we can send the
+ * copychunk ioctl to the server.
+ */
+ inode = d_inode(trgtfile->dentry);
+ filemap_write_and_wait(inode->i_mapping);
+
if (pcchunk == NULL)
return -ENOMEM;
@@ -2406,8 +2422,8 @@ again:
cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
goto qdf_free;
}
- fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId);
- fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId);
+ fid->persistent_fid = op_rsp->PersistentFileId;
+ fid->volatile_fid = op_rsp->VolatileFileId;
/* Anything else than ENODATA means a genuine error */
if (rc && rc != -ENODATA) {
@@ -2487,7 +2503,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_pend_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n",
@@ -2645,7 +2661,7 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
*/
int
smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
- __le16 *utf16_path, u32 desired_access,
+ const char *path, u32 desired_access,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb)
@@ -2663,6 +2679,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
int rc;
+ __le16 *utf16_path;
+ struct cached_fid *cfid = NULL;
+
+ if (!path)
+ path = "";
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -2671,6 +2695,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid);
+
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
@@ -2692,15 +2718,29 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = qi_iov;
rqst[1].rq_nvec = 1;
- rc = SMB2_query_info_init(tcon, server,
- &rqst[1], COMPOUND_FID, COMPOUND_FID,
- class, type, 0,
- output_len, 0,
- NULL);
+ if (cfid) {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1],
+ cfid->fid->persistent_fid,
+ cfid->fid->volatile_fid,
+ class, type, 0,
+ output_len, 0,
+ NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ class, type, 0,
+ output_len, 0,
+ NULL);
+ }
if (rc)
goto qic_exit;
- smb2_set_next_command(tcon, &rqst[1]);
- smb2_set_related(&rqst[1]);
+ if (!cfid) {
+ smb2_set_next_command(tcon, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+ }
memset(&close_iov, 0, sizeof(close_iov));
rqst[2].rq_iov = close_iov;
@@ -2712,9 +2752,15 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_related(&rqst[2]);
- rc = compound_send_recv(xid, ses, server,
- flags, 3, rqst,
- resp_buftype, rsp_iov);
+ if (cfid) {
+ rc = compound_send_recv(xid, ses, server,
+ flags, 1, &rqst[1],
+ &resp_buftype[1], &rsp_iov[1]);
+ } else {
+ rc = compound_send_recv(xid, ses, server,
+ flags, 3, rqst,
+ resp_buftype, rsp_iov);
+ }
if (rc) {
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
if (rc == -EREMCHG) {
@@ -2728,11 +2774,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
*buftype = resp_buftype[1];
qic_exit:
+ kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ if (cfid)
+ close_cached_dir(cfid);
return rc;
}
@@ -2742,13 +2791,12 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_query_info_rsp *rsp;
struct smb2_fs_full_size_info *info = NULL;
- __le16 utf16_path = 0; /* Null - open root of share */
struct kvec rsp_iov = {NULL, 0};
int buftype = CIFS_NO_BUFFER;
int rc;
- rc = smb2_query_info_compound(xid, tcon, &utf16_path,
+ rc = smb2_query_info_compound(xid, tcon, "",
FILE_READ_ATTRIBUTES,
FS_FULL_SIZE_INFORMATION,
SMB2_O_INFO_FILESYSTEM,
@@ -3887,29 +3935,38 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
+ struct inode *inode;
struct cifsFileInfo *cfile = file->private_data;
+ struct cifsInodeInfo *cifsi;
__le64 eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode) ||
- off + len >= i_size_read(file->f_inode)) {
+ inode = d_inode(cfile->dentry);
+ cifsi = CIFS_I(inode);
+
+ if (off >= i_size_read(inode) ||
+ off + len >= i_size_read(inode)) {
rc = -EINVAL;
goto out;
}
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(file->f_inode) - off - len, off);
+ i_size_read(inode) - off - len, off);
if (rc < 0)
goto out;
- eof = cpu_to_le64(i_size_read(file->f_inode) - len);
+ eof = cpu_to_le64(i_size_read(inode) - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
goto out;
rc = 0;
+
+ cifsi->server_eof = i_size_read(inode) - len;
+ truncate_setsize(inode, cifsi->server_eof);
+ fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
out:
free_xid(xid);
return rc;
@@ -4283,12 +4340,12 @@ static __le32
map_oplock_to_lease(u8 oplock)
{
if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
- return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
+ return SMB2_LEASE_WRITE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE;
else if (oplock == SMB2_OPLOCK_LEVEL_II)
- return SMB2_LEASE_READ_CACHING;
+ return SMB2_LEASE_READ_CACHING_LE;
else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
- return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
- SMB2_LEASE_WRITE_CACHING;
+ return SMB2_LEASE_HANDLE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE;
return 0;
}
@@ -4350,7 +4407,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
struct create_lease *lc = (struct create_lease *)buf;
*epoch = 0; /* not used */
- if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
return le32_to_cpu(lc->lcontext.LeaseState);
}
@@ -4361,7 +4418,7 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
*epoch = le16_to_cpu(lc->lcontext.Epoch);
- if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+ if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
if (lease_key)
memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
@@ -5804,8 +5861,8 @@ struct smb_version_values smb20_values = {
.protocol_id = SMB20_PROT_ID,
.req_capabilities = 0, /* MBZ */
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5825,8 +5882,8 @@ struct smb_version_values smb21_values = {
.protocol_id = SMB21_PROT_ID,
.req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5846,8 +5903,8 @@ struct smb_version_values smb3any_values = {
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5867,8 +5924,8 @@ struct smb_version_values smbdefault_values = {
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5888,8 +5945,8 @@ struct smb_version_values smb30_values = {
.protocol_id = SMB30_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5909,8 +5966,8 @@ struct smb_version_values smb302_values = {
.protocol_id = SMB302_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
@@ -5930,8 +5987,8 @@ struct smb_version_values smb311_values = {
.protocol_id = SMB311_PROT_ID,
.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
- .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
- .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
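
Hedged aside on the *_LE renames in this file (the example names below are invented): on-wire SMB2 flag fields are little-endian, so constants compared against __le32 wire fields are themselves defined with cpu_to_le32(), keeping the comparison endian-safe and sparse-clean on big-endian hosts.

#define EXAMPLE_LEASE_READ_CACHING_LE	cpu_to_le32(0x01)	/* hypothetical mirror of SMB2_LEASE_READ_CACHING_LE */

static bool example_lease_allows_read(__le32 lease_state)
{
	/* both sides of the & are __le32, so no byte swap is needed */
	return (lease_state & EXAMPLE_LEASE_READ_CACHING_LE) != 0;
}
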
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 7e7909b1ae11..1b7ad0c09566 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -163,7 +163,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
return 0;
spin_lock(&cifs_tcp_ses_lock);
- if (tcon->tidStatus == CifsExiting) {
+ if (tcon->status == TID_EXITING) {
/*
* only tree disconnect, open, and write,
* (and ulogoff which does not have tcon)
@@ -2734,13 +2734,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
goto err_free_req;
}
- trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId),
- tcon->tid,
- ses->Suid, CREATE_NOT_FILE,
- FILE_WRITE_ATTRIBUTES);
+ trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid,
+ CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES);
- SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId),
- le64_to_cpu(rsp->VolatileFileId));
+ SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId);
/* Eventually save off posix specific response info and timestamps */
@@ -3009,14 +3006,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
} else if (rsp == NULL) /* unlikely to happen, but safer to check */
goto creat_exit;
else
- trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId),
- tcon->tid,
- ses->Suid, oparms->create_options,
- oparms->desired_access);
+ trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid,
+ oparms->create_options, oparms->desired_access);
atomic_inc(&tcon->num_remote_opens);
- oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);
- oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId);
+ oparms->fid->persistent_fid = rsp->PersistentFileId;
+ oparms->fid->volatile_fid = rsp->VolatileFileId;
oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId);
@@ -3313,8 +3308,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->PersistentFileId = cpu_to_le64(persistent_fid);
- req->VolatileFileId = cpu_to_le64(volatile_fid);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
if (query_attrs)
req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
else
@@ -3677,8 +3672,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = cpu_to_le64(persistent_fid);
- req->VolatileFileId = cpu_to_le64(volatile_fid);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
/* See note 354 of MS-SMB2, 64K max */
req->OutputBufferLength =
cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
@@ -3858,12 +3853,14 @@ void smb2_reconnect_server(struct work_struct *work)
tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
if (!tcon) {
resched = true;
- list_del_init(&ses->rlist);
- cifs_put_smb_ses(ses);
+ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
+ list_del_init(&ses->rlist);
+ cifs_put_smb_ses(ses);
+ }
goto done;
}
- tcon->tidStatus = CifsGood;
+ tcon->status = TID_GOOD;
tcon->retry = false;
tcon->need_reconnect = false;
@@ -3951,8 +3948,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = cpu_to_le64(persistent_fid);
- req->VolatileFileId = cpu_to_le64(volatile_fid);
+ req->PersistentFileId = persistent_fid;
+ req->VolatileFileId = volatile_fid;
iov[0].iov_base = (char *)req;
iov[0].iov_len = total_len;
@@ -4033,8 +4030,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
shdr = &req->hdr;
shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
- req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
+ req->PersistentFileId = io_parms->persistent_fid;
+ req->VolatileFileId = io_parms->volatile_fid;
req->ReadChannelInfoOffset = 0; /* reserved */
req->ReadChannelInfoLength = 0; /* reserved */
req->Channel = 0; /* reserved */
@@ -4094,8 +4091,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
*/
shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF);
- req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
- req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ req->PersistentFileId = (u64)-1;
+ req->VolatileFileId = (u64)-1;
}
}
if (remaining_bytes > io_parms->length)
@@ -4307,21 +4304,19 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
trace_smb3_read_err(xid,
- le64_to_cpu(req->PersistentFileId),
+ req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid,
- le64_to_cpu(req->PersistentFileId),
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, 0);
+ trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid,
+ ses->Suid, io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
trace_smb3_read_done(xid,
- le64_to_cpu(req->PersistentFileId),
+ req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
@@ -4463,8 +4458,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
shdr = (struct smb2_hdr *)req;
shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
- req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid);
- req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid);
+ req->PersistentFileId = wdata->cfile->fid.persistent_fid;
+ req->VolatileFileId = wdata->cfile->fid.volatile_fid;
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4562,7 +4557,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (rc) {
trace_smb3_write_err(0 /* no xid */,
- le64_to_cpu(req->PersistentFileId),
+ req->PersistentFileId,
tcon->tid, tcon->ses->Suid, wdata->offset,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
@@ -4615,8 +4610,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
- req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
+ req->PersistentFileId = io_parms->persistent_fid;
+ req->VolatileFileId = io_parms->volatile_fid;
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4645,7 +4640,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc) {
trace_smb3_write_err(xid,
- le64_to_cpu(req->PersistentFileId),
+ req->PersistentFileId,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset, io_parms->length, rc);
@@ -4654,7 +4649,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
trace_smb3_write_done(xid,
- le64_to_cpu(req->PersistentFileId),
+ req->PersistentFileId,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset, *nbytes);
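
Illustrative sketch of the opaque file-id convention behind the le64_to_cpu()/cpu_to_le64() removals above (struct and helper names are made up): the server's persistent/volatile ids are cookies the client only stores and echoes back, so no byte-order conversion is meaningful.

struct example_fid {
	__u64 persistent;	/* opaque endianness, never interpreted by the client */
	__u64 volatile_id;
};

static void example_save_fid(struct example_fid *fid,
			     __u64 persistent, __u64 volatile_id)
{
	fid->persistent = persistent;		/* stored verbatim ... */
	fid->volatile_id = volatile_id;		/* ... and sent back unchanged in later PDUs */
}
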
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 33cfd0a1adf1..d8c4388b190d 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -56,16 +56,6 @@ struct smb2_rdma_crypto_transform {
#define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
-#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
-
-struct smb2_err_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize;
- __le16 Reserved; /* MBZ */
- __le32 ByteCount; /* even if zero, at least one byte follows */
- __u8 ErrorData[1]; /* variable length */
-} __packed;
-
#define SYMLINK_ERROR_TAG 0x4c4d5953
struct smb2_symlink_err_rsp {
@@ -139,47 +129,6 @@ struct share_redirect_error_context_rsp {
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
-#define SMB2_LEASE_NONE cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002)
-#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004)
-
-#define SMB2_LEASE_KEY_SIZE 16
-
-struct lease_context {
- u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
-} __packed;
-
-struct lease_context_v2 {
- u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
- __le64 ParentLeaseKeyLow;
- __le64 ParentLeaseKeyHigh;
- __le16 Epoch;
- __le16 Reserved;
-} __packed;
-
-struct create_lease {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context lcontext;
-} __packed;
-
-struct create_lease_v2 {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context_v2 lcontext;
- __u8 Pad[4];
-} __packed;
-
struct create_durable {
struct create_context ccontext;
__u8 Name[8];
@@ -192,13 +141,6 @@ struct create_durable {
} Data;
} __packed;
-struct create_posix {
- struct create_context ccontext;
- __u8 Name[16];
- __le32 Mode;
- __u32 Reserved;
-} __packed;
-
/* See MS-SMB2 2.2.13.2.11 */
/* Flags */
#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
@@ -287,12 +229,6 @@ struct copychunk_ioctl {
__u32 Reserved2;
} __packed;
-/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
-struct file_zero_data_information {
- __le64 FileOffset;
- __le64 BeyondFinalZero;
-} __packed;
-
struct copychunk_ioctl_rsp {
__le32 ChunksWritten;
__le32 ChunkBytesWritten;
@@ -338,11 +274,6 @@ struct fsctl_get_integrity_information_rsp {
__le32 ClusterSizeInBytes;
} __packed;
-struct file_allocated_range_buffer {
- __le64 file_offset;
- __le64 length;
-} __packed;
-
/* Integrity ChecksumAlgorithm choices for above */
#define CHECKSUM_TYPE_NONE 0x0000
#define CHECKSUM_TYPE_CRC64 0x0002
@@ -351,53 +282,6 @@ struct file_allocated_range_buffer {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
-/* Reparse structures - see MS-FSCC 2.1.2 */
-
-/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
-
-struct reparse_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-struct reparse_guid_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 ReparseGuid[16];
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-struct reparse_mount_point_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le16 SubstituteNameOffset;
- __le16 SubstituteNameLength;
- __le16 PrintNameOffset;
- __le16 PrintNameLength;
- __u8 PathBuffer[]; /* Variable Length */
-} __packed;
-
-#define SYMLINK_FLAG_RELATIVE 0x00000001
-
-struct reparse_symlink_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le16 SubstituteNameOffset;
- __le16 SubstituteNameLength;
- __le16 PrintNameOffset;
- __le16 PrintNameLength;
- __le32 Flags;
- __u8 PathBuffer[]; /* Variable Length */
-} __packed;
-
-/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
-
-
/* See MS-DFSC 2.2.2 */
struct fsctl_get_dfs_referral_req {
__le16 MaxReferralLevel;
@@ -413,22 +297,6 @@ struct network_resiliency_req {
} __packed;
/* There is no buffer for the response ie no struct network_resiliency_rsp */
-
-struct validate_negotiate_info_req {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 DialectCount;
- __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-struct validate_negotiate_info_rsp {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 Dialect; /* Dialect in use for the connection */
-} __packed;
-
#define RSS_CAPABLE cpu_to_le32(0x00000001)
#define RDMA_CAPABLE cpu_to_le32(0x00000002)
@@ -464,14 +332,6 @@ struct compress_ioctl {
__le16 CompressionState; /* See cifspdu.h for possible flag values */
} __packed;
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
-
/*
* Maximum number of iovs we need for an ioctl request.
* [0] : struct smb2_ioctl_req
@@ -479,370 +339,11 @@ struct duplicate_extents_to_file {
*/
#define SMB2_IOCTL_IOV_SIZE 2
-struct smb2_ioctl_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 57 */
- __u16 Reserved;
- __le32 CtlCode;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 InputOffset;
- __le32 InputCount;
- __le32 MaxInputResponse;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 MaxOutputResponse;
- __le32 Flags;
- __u32 Reserved2;
- __u8 Buffer[];
-} __packed;
-
-struct smb2_ioctl_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 57 */
- __u16 Reserved;
- __le32 CtlCode;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 InputOffset;
- __le32 InputCount;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 Flags;
- __u32 Reserved2;
- /* char * buffer[] */
-} __packed;
-
-#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
-#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
-#define SMB2_LOCKFLAG_UNLOCK 0x0004
-#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
-
-struct smb2_lock_element {
- __le64 Offset;
- __le64 Length;
- __le32 Flags;
- __le32 Reserved;
-} __packed;
-
-struct smb2_lock_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 48 */
- __le16 LockCount;
- /*
- * The least significant four bits are the index, the other 28 bits are
- * the lock sequence number (0 to 64). See MS-SMB2 2.2.26
- */
- __le32 LockSequenceNumber;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- /* Followed by at least one */
- struct smb2_lock_element locks[1];
-} __packed;
-
-struct smb2_lock_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_echo_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-struct smb2_echo_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-/* search (query_directory) Flags field */
-#define SMB2_RESTART_SCANS 0x01
-#define SMB2_RETURN_SINGLE_ENTRY 0x02
-#define SMB2_INDEX_SPECIFIED 0x04
-#define SMB2_REOPEN 0x10
-
-#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
-
-/*
- * Valid FileInformation classes.
- *
- * Note that these are a subset of the (file) QUERY_INFO levels defined
- * later in this file (but since QUERY_DIRECTORY uses equivalent numbers
- * we do not redefine them here)
- *
- * FileDirectoryInfomation 0x01
- * FileFullDirectoryInformation 0x02
- * FileIdFullDirectoryInformation 0x26
- * FileBothDirectoryInformation 0x03
- * FileIdBothDirectoryInformation 0x25
- * FileNamesInformation 0x0C
- * FileIdExtdDirectoryInformation 0x3C
- */
-
-struct smb2_query_directory_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 FileInformationClass;
- __u8 Flags;
- __le32 FileIndex;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le16 FileNameOffset;
- __le16 FileNameLength;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_directory_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Possible InfoType values */
-#define SMB2_O_INFO_FILE 0x01
-#define SMB2_O_INFO_FILESYSTEM 0x02
-#define SMB2_O_INFO_SECURITY 0x03
-#define SMB2_O_INFO_QUOTA 0x04
-
-/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
-#define OWNER_SECINFO 0x00000001
-#define GROUP_SECINFO 0x00000002
-#define DACL_SECINFO 0x00000004
-#define SACL_SECINFO 0x00000008
-#define LABEL_SECINFO 0x00000010
-#define ATTRIBUTE_SECINFO 0x00000020
-#define SCOPE_SECINFO 0x00000040
-#define BACKUP_SECINFO 0x00010000
-#define UNPROTECTED_SACL_SECINFO 0x10000000
-#define UNPROTECTED_DACL_SECINFO 0x20000000
-#define PROTECTED_SACL_SECINFO 0x40000000
-#define PROTECTED_DACL_SECINFO 0x80000000
-
-/* Flags used for FileFullEAinfo */
-#define SL_RESTART_SCAN 0x00000001
-#define SL_RETURN_SINGLE_ENTRY 0x00000002
-#define SL_INDEX_SPECIFIED 0x00000004
-
-struct smb2_query_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 41 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 OutputBufferLength;
- __le16 InputBufferOffset;
- __u16 Reserved;
- __le32 InputBufferLength;
- __le32 AdditionalInformation;
- __le32 Flags;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/*
- * Maximum number of iovs we need for a set-info request.
- * The largest one is rename/hardlink
- * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
- * [1] : path
- * [2] : compound padding
- */
-#define SMB2_SET_INFO_IOV_SIZE 3
-
-struct smb2_set_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 BufferLength;
- __le16 BufferOffset;
- __u16 Reserved;
- __le32 AdditionalInformation;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 2 */
-} __packed;
-
-struct smb2_oplock_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __u64 PersistentFid;
- __u64 VolatileFid;
-} __packed;
-
-#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
-
-struct smb2_lease_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 44 */
- __le16 Epoch;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 CurrentLeaseState;
- __le32 NewLeaseState;
- __le32 BreakReason;
- __le32 AccessMaskHint;
- __le32 ShareMaskHint;
-} __packed;
-
-struct smb2_lease_ack {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 Reserved;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 LeaseState;
- __le64 LeaseDuration;
-} __packed;
-
/*
- * PDU infolevel structure definitions
+ * PDU query infolevel structure definitions
* BB consider moving to a different header
*/
-/* File System Information Classes */
-#define FS_VOLUME_INFORMATION 1 /* Query */
-#define FS_LABEL_INFORMATION 2 /* Local only */
-#define FS_SIZE_INFORMATION 3 /* Query */
-#define FS_DEVICE_INFORMATION 4 /* Query */
-#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
-#define FS_CONTROL_INFORMATION 6 /* Query, Set */
-#define FS_FULL_SIZE_INFORMATION 7 /* Query */
-#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */
-#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */
-#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
-#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
-
-struct smb2_fs_full_size_info {
- __le64 TotalAllocationUnits;
- __le64 CallerAvailableAllocationUnits;
- __le64 ActualAvailableAllocationUnits;
- __le32 SectorsPerAllocationUnit;
- __le32 BytesPerSector;
-} __packed;
-
-#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
-#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
-#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
-#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
-
-/* sector size info struct */
-struct smb3_fs_ss_info {
- __le32 LogicalBytesPerSector;
- __le32 PhysicalBytesPerSectorForAtomicity;
- __le32 PhysicalBytesPerSectorForPerf;
- __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
- __le32 Flags;
- __le32 ByteOffsetForSectorAlignment;
- __le32 ByteOffsetForPartitionAlignment;
-} __packed;
-
-/* volume info struct - see MS-FSCC 2.5.9 */
-#define MAX_VOL_LABEL_LEN 32
-struct smb3_fs_vol_info {
- __le64 VolumeCreationTime;
- __u32 VolumeSerialNumber;
- __le32 VolumeLabelLength; /* includes trailing null */
- __u8 SupportsObjects; /* True if eg like NTFS, supports objects */
- __u8 Reserved;
- __u8 VolumeLabel[]; /* variable len */
-} __packed;
-
-/* partial list of QUERY INFO levels */
-#define FILE_DIRECTORY_INFORMATION 1
-#define FILE_FULL_DIRECTORY_INFORMATION 2
-#define FILE_BOTH_DIRECTORY_INFORMATION 3
-#define FILE_BASIC_INFORMATION 4
-#define FILE_STANDARD_INFORMATION 5
-#define FILE_INTERNAL_INFORMATION 6
-#define FILE_EA_INFORMATION 7
-#define FILE_ACCESS_INFORMATION 8
-#define FILE_NAME_INFORMATION 9
-#define FILE_RENAME_INFORMATION 10
-#define FILE_LINK_INFORMATION 11
-#define FILE_NAMES_INFORMATION 12
-#define FILE_DISPOSITION_INFORMATION 13
-#define FILE_POSITION_INFORMATION 14
-#define FILE_FULL_EA_INFORMATION 15
-#define FILE_MODE_INFORMATION 16
-#define FILE_ALIGNMENT_INFORMATION 17
-#define FILE_ALL_INFORMATION 18
-#define FILE_ALLOCATION_INFORMATION 19
-#define FILE_END_OF_FILE_INFORMATION 20
-#define FILE_ALTERNATE_NAME_INFORMATION 21
-#define FILE_STREAM_INFORMATION 22
-#define FILE_PIPE_INFORMATION 23
-#define FILE_PIPE_LOCAL_INFORMATION 24
-#define FILE_PIPE_REMOTE_INFORMATION 25
-#define FILE_MAILSLOT_QUERY_INFORMATION 26
-#define FILE_MAILSLOT_SET_INFORMATION 27
-#define FILE_COMPRESSION_INFORMATION 28
-#define FILE_OBJECT_ID_INFORMATION 29
-/* Number 30 not defined in documents */
-#define FILE_MOVE_CLUSTER_INFORMATION 31
-#define FILE_QUOTA_INFORMATION 32
-#define FILE_REPARSE_POINT_INFORMATION 33
-#define FILE_NETWORK_OPEN_INFORMATION 34
-#define FILE_ATTRIBUTE_TAG_INFORMATION 35
-#define FILE_TRACKING_INFORMATION 36
-#define FILEID_BOTH_DIRECTORY_INFORMATION 37
-#define FILEID_FULL_DIRECTORY_INFORMATION 38
-#define FILE_VALID_DATA_LENGTH_INFORMATION 39
-#define FILE_SHORT_NAME_INFORMATION 40
-#define FILE_SFIO_RESERVE_INFORMATION 44
-#define FILE_SFIO_VOLUME_INFORMATION 45
-#define FILE_HARD_LINK_INFORMATION 46
-#define FILE_NORMALIZED_NAME_INFORMATION 48
-#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
-#define FILE_STANDARD_LINK_INFORMATION 54
-#define FILE_ID_INFORMATION 59
-#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60
-
-struct smb2_file_internal_info {
- __le64 IndexNumber;
-} __packed; /* level 6 Query */
-
-struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[]; /* New name to be assigned */
- /* padding - overall struct size must be >= 24 so filename + pad >= 6 */
-} __packed; /* level 10 Set */
-
-struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[]; /* Name to be assigned to new link */
-} __packed; /* level 11 Set */
-
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__le32 next_entry_offset;
__u8 flags;
@@ -851,38 +352,6 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */
char ea_data[]; /* \0 terminated name plus value */
} __packed; /* level 15 Set */
-/*
- * This level 18, although with struct with same name is different from cifs
- * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
- * CurrentByteOffset.
- */
-struct smb2_file_all_info { /* data block encoding of response to level 18 */
- __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
- __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile; /* size ie offset to first free byte in file */
- __le32 NumberOfLinks; /* hard links */
- __u8 DeletePending;
- __u8 Directory;
- __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
- __le64 IndexNumber;
- __le32 EASize;
- __le32 AccessFlags;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
- __le32 FileNameLength;
- char FileName[1];
-} __packed; /* level 18 Query */
-
-struct smb2_file_eof_info { /* encoding of request for level 10 */
- __le64 EndOfFile; /* new end of file value */
-} __packed; /* level 20 Set */
-
struct smb2_file_reparse_point_info {
__le64 IndexNumber;
__le32 Tag;
@@ -935,6 +404,8 @@ struct create_posix_rsp {
struct cifs_sid group; /* var-sized on the wire */
} __packed;
+#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
+
/*
* SMB2-only POSIX info level for query dir
*
@@ -966,31 +437,6 @@ struct smb2_posix_info {
*/
} __packed;
-/* Level 100 query info */
-struct smb311_posix_qinfo {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 EndOfFile;
- __le64 AllocationSize;
- __le32 DosAttributes;
- __le64 Inode;
- __le32 DeviceId;
- __le32 Zero;
- /* beginning of POSIX Create Context Response */
- __le32 HardLinks;
- __le32 ReparseTag;
- __le32 Mode;
- u8 Sids[];
- /*
- * var sized owner SID
- * var sized group SID
- * le32 filenamelength
- * u8 filename[]
- */
-} __packed;
-
/*
* Parsed version of the above struct. Allows direct access to the
* variable length fields
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 4a7062fd1c26..a69f1eed1cfe 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -283,7 +283,7 @@ extern int smb311_update_preauth_hash(struct cifs_ses *ses,
struct kvec *iov, int nvec);
extern int smb2_query_info_compound(const unsigned int xid,
struct cifs_tcon *tcon,
- __le16 *utf16_path, u32 desired_access,
+ const char *path, u32 desired_access,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 6cecf302dcfd..bc279616c513 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -1006,6 +1006,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
DEFINE_SMB3_CREDIT_EVENT(insufficient_credits);
DEFINE_SMB3_CREDIT_EVENT(too_many_credits);
DEFINE_SMB3_CREDIT_EVENT(add_credits);
+DEFINE_SMB3_CREDIT_EVENT(adj_credits);
+DEFINE_SMB3_CREDIT_EVENT(hdr_credits);
+DEFINE_SMB3_CREDIT_EVENT(nblk_credits);
+DEFINE_SMB3_CREDIT_EVENT(pend_credits);
+DEFINE_SMB3_CREDIT_EVENT(wait_credits);
+DEFINE_SMB3_CREDIT_EVENT(waitff_credits);
+DEFINE_SMB3_CREDIT_EVENT(overflow_credits);
DEFINE_SMB3_CREDIT_EVENT(set_credits);
#endif /* _CIFS_TRACE_H */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a4c3e027cca2..c667e6ddfe2f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -430,7 +430,7 @@ unmask:
* be taken as the remainder of this one. We need to kill the
* socket so the server throws away the partial SMB
*/
- cifs_mark_tcp_ses_conns_for_reconnect(server, false);
+ cifs_signal_cifsd_for_reconnect(server, false);
trace_smb3_partial_send_reconnect(server->CurrentMid,
server->conn_id, server->hostname);
}
@@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return -EIO;
}
- tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS);
if (!tr_hdr)
return -ENOMEM;
memset(&cur_rqst[0], 0, sizeof(cur_rqst));
memset(&iov, 0, sizeof(iov));
- memset(tr_hdr, 0, sizeof(*tr_hdr));
iov.iov_base = tr_hdr;
iov.iov_len = sizeof(*tr_hdr);
@@ -542,7 +541,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_nblk_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -1, in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
__func__, 1, scredits);
@@ -648,7 +647,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_waitff_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
-(num_credits), in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 29dd87be2fb8..3f3c81e6b1ab 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -14,6 +14,7 @@
#include <linux/time.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/stat.h>
#include <linux/cred.h>
#include <linux/errno.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d9f1bd7153df..2185328b65c7 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep;
static struct inode *coda_alloc_inode(struct super_block *sb)
{
struct coda_inode_info *ei;
- ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 95e72d271b95..8f0af4f62631 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -135,6 +135,8 @@
#define elf_format compat_elf_format
#define init_elf_binfmt init_compat_elf_binfmt
#define exit_elf_binfmt exit_compat_elf_binfmt
+#define binfmt_elf_test_cases compat_binfmt_elf_test_cases
+#define binfmt_elf_test_suite compat_binfmt_elf_test_suite
/*
* We share all the actual code with the native (64-bit) version.
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c060c0a2d72..ebc43f960b64 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -31,7 +31,6 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
-#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
@@ -42,6 +41,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -53,6 +53,9 @@
#include <trace/events/sched.h>
+static bool dump_vma_snapshot(struct coredump_params *cprm);
+static void free_vma_snapshot(struct coredump_params *cprm);
+
static int core_uses_pid;
static unsigned int core_pipe_limit;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -531,6 +534,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* by any locks.
*/
.mm_flags = mm->flags,
+ .vma_meta = NULL,
};
audit_core_dumps(siginfo->si_signo);
@@ -745,6 +749,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
pr_info("Core dump to |%s disabled\n", cn.corename);
goto close_fail;
}
+ if (!dump_vma_snapshot(&cprm))
+ goto close_fail;
+
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
/*
@@ -758,6 +765,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
dump_emit(&cprm, "", 1);
}
file_end_write(cprm.file);
+ free_vma_snapshot(&cprm);
}
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
@@ -980,6 +988,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1039,9 +1049,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1078,18 +1099,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
return gate_vma;
}
+static void free_vma_snapshot(struct coredump_params *cprm)
+{
+ if (cprm->vma_meta) {
+ int i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct file *file = cprm->vma_meta[i].file;
+ if (file)
+ fput(file);
+ }
+ kvfree(cprm->vma_meta);
+ cprm->vma_meta = NULL;
+ }
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
*/
-int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
- struct core_vma_metadata **vma_meta,
- size_t *vma_data_size_ptr)
+static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct mm_struct *mm = current->mm;
int i;
- size_t vma_data_size = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1097,36 +1129,51 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
* mmap_lock in read mode.
*/
if (mmap_write_lock_killable(mm))
- return -EINTR;
+ return false;
+ cprm->vma_data_size = 0;
gate_vma = get_gate_vma(mm);
- *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+ cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);
- *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
- if (!*vma_meta) {
+ cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
+ if (!cprm->vma_meta) {
mmap_write_unlock(mm);
- return -ENOMEM;
+ return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
- struct core_vma_metadata *m = (*vma_meta) + i;
+ struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+ m->pgoff = vma->vm_pgoff;
- vma_data_size += m->dump_size;
+ m->file = vma->vm_file;
+ if (m->file)
+ get_file(m->file);
}
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count)) {
- kvfree(*vma_meta);
- return -EFAULT;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = cprm->vma_meta + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ cprm->vma_data_size += m->dump_size;
}
- *vma_data_size_ptr = vma_data_size;
- return 0;
+ return true;
}
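
A hedged sketch of the placeholder fixup performed after the mmap lock is dropped (the helper name is invented): copy the first bytes of the mapping from userspace and keep one page only if they carry the ELF magic.

static unsigned long example_resolve_elfhdr_placeholder(unsigned long start)
{
	char magic[SELFMAG];

	/* safe here: the mmap lock is no longer held, so copy_from_user() may fault */
	if (copy_from_user(magic, (void __user *)start, SELFMAG))
		return 0;
	return memcmp(magic, ELFMAG, SELFMAG) == 0 ? PAGE_SIZE : 0;
}
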
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index bfc2a5b74ed3..2217fe5ece6f 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
int num_pages = 0;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
+ bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
+ GFP_NOFS);
while (len) {
unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
if (num_pages == 0) {
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector =
pblk << (blockbits - SECTOR_SHIFT);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
if (WARN_ON(ret != bytes_this_page)) {
@@ -81,7 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
num_pages = 0;
}
}
@@ -150,12 +149,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
return -EINVAL;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, nr_pages);
+ bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector = pblk << (blockbits - 9);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
i = 0;
offset = 0;
@@ -182,7 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
} while (len != 0);
err = 0;
out:
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 4ef3f714046a..526a4c1bed99 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
}
EXPORT_SYMBOL(fscrypt_free_bounce_page);
+/*
+ * Generate the IV for the given logical block number within the given file.
+ * For filenames encryption, lblk_num == 0.
+ *
+ * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
+ * needs to know about any IV generation methods where the low bits of IV don't
+ * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ */
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci)
{
@@ -240,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
* which must still be locked and not uptodate. Normally, blocksize ==
* PAGE_SIZE and the whole page is decrypted at once.
*
- * This is for use by the filesystem's ->readpages() method.
+ * This is for use by the filesystem's ->readahead() method.
*
* Return: 0 on success; -errno on failure
*/
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c57bebfa48fe..93c2ca858092 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -17,6 +17,7 @@
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include "fscrypt_private.h"
@@ -315,6 +316,10 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
*
* fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
*
+ * This function isn't required in cases where crypto-mergeability is ensured in
+ * another way, such as I/O targeting only a single file (and thus a single key)
+ * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity.
+ *
* Return: true iff the I/O is mergeable
*/
bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
@@ -363,3 +368,91 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
return fscrypt_mergeable_bio(bio, inode, next_lblk);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
+
+/**
+ * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
+ * supported as far as encryption is concerned
+ * @iocb: the file and position the I/O is targeting
+ * @iter: the I/O data segment(s)
+ *
+ * Return: %true if there are no encryption constraints that prevent DIO from
+ * being supported; %false if DIO is unsupported. (Note that in the
+ * %true case, the filesystem might have other, non-encryption-related
+ * constraints that prevent DIO from actually being supported.)
+ */
+bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+{
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const unsigned int blocksize = i_blocksize(inode);
+
+ /* If the file is unencrypted, no veto from us. */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return true;
+
+ /* We only support DIO with inline crypto, not fs-layer crypto. */
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+
+ /*
+ * Since the granularity of encryption is filesystem blocks, the file
+ * position and total I/O length must be aligned to the filesystem block
+ * size -- not just to the block device's logical block size as is
+ * traditionally the case for DIO on many filesystems.
+ *
+ * We require that the user-provided memory buffers be filesystem block
+ * aligned too. It is simpler to have a single alignment value required
+ * for all properties of the I/O, as is normally the case for DIO.
+ * Also, allowing less aligned buffers would imply that data units could
+ * cross bvecs, which would greatly complicate the I/O stack, which
+ * assumes that bios can be split at any bvec boundary.
+ */
+ if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
+
+/**
+ * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
+ * @inode: the file on which I/O is being done
+ * @lblk: the block at which the I/O is being started from
+ * @nr_blocks: the number of blocks we want to submit starting at @lblk
+ *
+ * Determine the limit to the number of blocks that can be submitted in a bio
+ * targeting @lblk without causing a data unit number (DUN) discontiguity.
+ *
+ * This is normally just @nr_blocks, as normally the DUNs just increment along
+ * with the logical blocks. (Or the file is not encrypted.)
+ *
+ * In rare cases, fscrypt can be using an IV generation method that allows the
+ * DUN to wrap around within logically contiguous blocks, and that wraparound
+ * will occur. If this happens, a value less than @nr_blocks will be returned
+ * so that the wraparound doesn't occur in the middle of a bio, which would
+ * cause encryption/decryption to produce wrong results.
+ *
+ * Return: the actual number of blocks that can be submitted
+ */
+u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
+{
+ const struct fscrypt_info *ci;
+ u32 dun;
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return nr_blocks;
+
+ if (nr_blocks <= 1)
+ return nr_blocks;
+
+ ci = inode->i_crypt_info;
+ if (!(fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ return nr_blocks;
+
+ /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */
+
+ dun = ci->ci_hashed_ino + lblk;
+
+ return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks);
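
Worked example of the IV_INO_LBLK_32 wraparound arithmetic described above (a standalone sketch, not the kernel function): the DUN is the low 32 bits of hashed_ino + lblk, so a bio must stop before that sum wraps past U32_MAX.

static u64 example_limit_blocks(u32 hashed_ino, u64 lblk, u64 nr_blocks)
{
	u32 dun = hashed_ino + (u32)lblk;	/* wraps mod 2^32, exactly like the DUN */

	/* e.g. dun == U32_MAX allows only one more block before the wrap */
	return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
}
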
diff --git a/fs/dax.c b/fs/dax.c
index cd03485867a7..67a08a32fccb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -11,7 +11,6 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
@@ -390,7 +389,7 @@ static struct page *dax_busy_page(void *entry)
}
/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock_page - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
diff --git a/fs/dcache.c b/fs/dcache.c
index c84269c6e8bf..93f4f5ee07bf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1766,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
char *dname;
int err;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
+ GFP_KERNEL);
if (!dentry)
return NULL;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 2f117c57160d..3dcf0b8b4e93 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -450,6 +450,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
+ *
+ * NOTE: it's expected that most callers should _ignore_ the errors returned
+ * by this function. Other debugfs functions handle the fact that the "dentry"
+ * passed to them could be an error and they don't crash in that case.
+ * Drivers should generally work fine even if debugfs fails to init anyway.
*/
struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
@@ -551,6 +556,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
+ *
+ * NOTE: it's expected that most callers should _ignore_ the errors returned
+ * by this function. Other debugfs functions handle the fact that the "dentry"
+ * passed to them could be an error and they don't crash in that case.
+ * Drivers should generally work fine even if debugfs fails to init anyway.
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 654443558047..aef06e607b40 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -396,18 +396,12 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* bio_alloc() is guaranteed to return a bio when allowed to sleep and
* we request a valid number of vectors.
*/
- bio = bio_alloc(GFP_KERNEL, nr_vecs);
-
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL);
bio->bi_iter.bi_sector = first_sector;
- bio_set_op_attrs(bio, dio->op, dio->op_flags);
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
bio->bi_end_io = dio_bio_end_io;
-
- bio->bi_write_hint = dio->iocb->ki_hint;
-
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
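
For reference, a minimal sketch of the updated bio_alloc() calling convention used in this hunk (the device, vector count and operation are now passed at allocation time, so the separate bio_set_dev()/bio_set_op_attrs() calls go away); the helper name is illustrative.

static struct bio *example_alloc_write_bio(struct block_device *bdev,
					   unsigned short nr_vecs,
					   sector_t first_sector)
{
	/* with __GFP_DIRECT_RECLAIM in the mask this allocation cannot fail */
	struct bio *bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, GFP_KERNEL);

	bio->bi_iter.bi_sector = first_sector;
	return bio;
}
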
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 7d85e64ea62f..9ad61b582f07 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -540,12 +540,13 @@ const struct address_space_operations ecryptfs_aops = {
* XXX: This is pretty broken for multiple reasons: ecryptfs does not
* actually use buffer_heads, and ecryptfs will crash without
* CONFIG_BLOCK. But it matches the behavior before the default for
- * address_space_operations without the ->set_page_dirty method was
+ * address_space_operations without the ->dirty_folio method was
* cleaned up, so this is the best we can do without maintainer
* feedback.
*/
#ifdef CONFIG_BLOCK
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
#endif
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 39116af0390f..0b1c878317ab 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
struct ecryptfs_inode_info *inode_info;
struct inode *inode = NULL;
- inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
+ inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 62b155b9366b..b287f47c165b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
- ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 226a57c57ee6..780db1e5f4b7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -28,10 +28,10 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
- struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *const mapping = inode->i_mapping;
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
@@ -60,6 +60,12 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
return buf->base + (offset & ~PAGE_MASK);
}
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
+}
+
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
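erofs_read_metabuf() is refactored so that the new erofs_bread() can read a block-sized buffer through any inode's page cache, with the old entry point kept as a thin wrapper around the block device inode. The directory and namei hunks below all follow the same pattern; a hedged sketch of it, using erofs-internal types from fs/erofs/internal.h and a placeholder function name:

/* Sketch: read one directory block via the metabuf API and release it. */
static int example_read_dir_block(struct inode *dir, erofs_blk_t blk)
{
        struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
        struct erofs_dirent *de;

        de = erofs_bread(&buf, dir, blk, EROFS_KMAP);
        if (IS_ERR(de))
                return PTR_ERR(de);

        /* ... parse dirents in the mapped block ... */

        erofs_put_metabuf(&buf);        /* unmaps and drops the page reference */
        return 0;
}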
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index eee9b0b31b63..18e59821c597 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "internal.h"
@@ -67,7 +68,7 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
- struct address_space *mapping = dir->i_mapping;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
const size_t dirsize = i_size_read(dir);
unsigned int i = ctx->pos / EROFS_BLKSIZ;
unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
@@ -75,26 +76,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
bool initial = true;
while (ctx->pos < dirsize) {
- struct page *dentry_page;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- dentry_page = read_mapping_page(mapping, i, NULL);
- if (dentry_page == ERR_PTR(-ENOMEM)) {
- err = -ENOMEM;
- break;
- } else if (IS_ERR(dentry_page)) {
+ de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
erofs_err(dir->i_sb,
"fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
- err = -EFSCORRUPTED;
+ err = PTR_ERR(de);
break;
}
- de = (struct erofs_dirent *)kmap(dentry_page);
-
nameoff = le16_to_cpu(de->nameoff);
-
if (nameoff < sizeof(struct erofs_dirent) ||
nameoff >= PAGE_SIZE) {
erofs_err(dir->i_sb,
@@ -119,10 +113,6 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
err = erofs_fill_dentries(dir, ctx, de, &ofs,
nameoff, maxsize);
skip_this:
- kunmap(dentry_page);
-
- put_page(dentry_page);
-
ctx->pos = blknr_to_addr(i) + ofs;
if (err)
@@ -130,6 +120,7 @@ skip_this:
++i;
ofs = 0;
}
+ erofs_put_metabuf(&buf);
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 3ea62c6fb00a..1238ca104f09 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,6 +12,7 @@
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -186,8 +187,8 @@ struct erofs_inode_extended {
__le32 i_uid;
__le32 i_gid;
- __le64 i_ctime;
- __le32 i_ctime_nsec;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
__le32 i_nlink;
__u8 i_reserved2[16];
};
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index ff62f84f47d3..e8b37ba5e9ad 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -113,8 +113,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5aa2cf2c2f80..5298c4ee277d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -479,6 +479,8 @@ struct erofs_map_dev {
extern const struct file_operations erofs_file_fops;
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 8629e616028c..554efa363317 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
@@ -86,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static struct page *find_target_block_classic(struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *find_target_block_classic(struct erofs_buf *target,
+ struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
{
unsigned int startprfx, endprfx;
int head, back;
- struct address_space *const mapping = dir->i_mapping;
- struct page *candidate = ERR_PTR(-ENOENT);
+ void *candidate = ERR_PTR(-ENOENT);
startprfx = endprfx = 0;
head = 0;
@@ -101,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir,
while (head <= back) {
const int mid = head + (back - head) / 2;
- struct page *page = read_mapping_page(mapping, mid, NULL);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
- if (!IS_ERR(page)) {
- struct erofs_dirent *de = kmap_atomic(page);
+ de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff,
EROFS_BLKSIZ);
const int ndirents = nameoff / sizeof(*de);
@@ -113,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir,
struct erofs_qstr dname;
if (!ndirents) {
- kunmap_atomic(de);
- put_page(page);
+ erofs_put_metabuf(&buf);
erofs_err(dir->i_sb,
"corrupted dir block %d @ nid %llu",
mid, EROFS_I(dir)->nid);
DBG_BUGON(1);
- page = ERR_PTR(-EFSCORRUPTED);
+ de = ERR_PTR(-EFSCORRUPTED);
goto out;
}
@@ -135,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- kunmap_atomic(de);
if (!diff) {
*_ndirents = 0;
@@ -145,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir,
startprfx = matched;
if (!IS_ERR(candidate))
- put_page(candidate);
- candidate = page;
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
*_ndirents = ndirents;
} else {
- put_page(page);
+ erofs_put_metabuf(&buf);
back = mid - 1;
endprfx = matched;
@@ -158,8 +159,8 @@ static struct page *find_target_block_classic(struct inode *dir,
}
out: /* free if the candidate is valid */
if (!IS_ERR(candidate))
- put_page(candidate);
- return page;
+ erofs_put_metabuf(target);
+ return de;
}
return candidate;
}
@@ -169,8 +170,7 @@ int erofs_namei(struct inode *dir,
erofs_nid_t *nid, unsigned int *d_type)
{
int ndirents;
- struct page *page;
- void *data;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
struct erofs_qstr qn;
@@ -181,26 +181,20 @@ int erofs_namei(struct inode *dir,
qn.end = name->name + name->len;
ndirents = 0;
- page = find_target_block_classic(dir, &qn, &ndirents);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
- data = kmap_atomic(page);
/* the target page has been mapped */
if (ndirents)
- de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
- else
- de = (struct erofs_dirent *)data;
+ de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
*d_type = de->file_type;
}
-
- kunmap_atomic(data);
- put_page(page);
-
+ erofs_put_metabuf(&buf);
return PTR_ERR_OR_ZERO(de);
}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 915eefe0d7e2..0c4b41130c2f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -84,7 +84,7 @@ static void erofs_inode_init_once(void *ptr)
static struct inode *erofs_alloc_inode(struct super_block *sb)
{
struct erofs_inode *vi =
- kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
@@ -281,21 +281,19 @@ static int erofs_init_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
unsigned int blkszbits;
void *data;
int ret;
- page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
- return PTR_ERR(page);
+ return PTR_ERR(data);
}
sbi = EROFS_SB(sb);
-
- data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -365,8 +363,7 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
out:
- kunmap(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return ret;
}
@@ -535,25 +532,29 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void erofs_managed_cache_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+/*
+ * It will be called only on inode eviction. In case that there are still some
+ * decompression requests in progress, wait with rescheduling for a bit here.
+ * We could introduce extra locking instead but it seems unnecessary.
+ */
+static void erofs_managed_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- const unsigned int stop = length + offset;
+ const size_t stop = length + offset;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
/* Check for potential overflow in debug mode */
- DBG_BUGON(stop > PAGE_SIZE || stop < length);
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
- if (offset == 0 && stop == PAGE_SIZE)
- while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+ if (offset == 0 && stop == folio_size(folio))
+ while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
.releasepage = erofs_managed_cache_releasepage,
- .invalidatepage = erofs_managed_cache_invalidatepage,
+ .invalidate_folio = erofs_managed_cache_invalidate_folio,
};
static int erofs_init_managed_cache(struct super_block *sb)
@@ -568,8 +569,7 @@ static int erofs_init_managed_cache(struct super_block *sb)
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping,
- GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->managed_cache = inode;
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index dac252bc9228..f3babf1e6608 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -221,9 +221,11 @@ void erofs_unregister_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- kobject_del(&sbi->s_kobj);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
}
int __init erofs_init_sysfs(void)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 423bc1a61da5..e6dea6dfca16 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -192,7 +192,10 @@ enum z_erofs_collectmode {
COLLECT_PRIMARY_FOLLOWED,
};
-struct z_erofs_collector {
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+ struct erofs_map_blocks map;
+
struct z_erofs_pagevec_ctor vector;
struct z_erofs_pcluster *pcl, *tailpcl;
@@ -202,13 +205,6 @@ struct z_erofs_collector {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_collectmode mode;
-};
-
-struct z_erofs_decompress_frontend {
- struct inode *const inode;
-
- struct z_erofs_collector clt;
- struct erofs_map_blocks map;
bool readahead;
/* used for applying cache strategy on the fly */
@@ -216,30 +212,30 @@ struct z_erofs_decompress_frontend {
erofs_off_t headoffset;
};
-#define COLLECTOR_INIT() { \
- .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
-
#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .clt = COLLECTOR_INIT(), \
- .backmost = true, }
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = COLLECT_PRIMARY_FOLLOWED }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
-static void preload_compressed_pages(struct z_erofs_collector *clt,
- struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
+ enum z_erofs_cache_alloctype type,
+ struct page **pagepool)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
bool standalone = true;
+ /*
+ * optimistic allocation without direct reclaim since inplace I/O
+ * can be used if low memory otherwise.
+ */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
struct page **pages;
pgoff_t index;
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
return;
pages = pcl->compressed_pages;
@@ -288,7 +284,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -350,47 +346,47 @@ int erofs_try_to_free_cached_page(struct page *page)
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct page *page)
{
- struct z_erofs_pcluster *const pcl = clt->pcl;
+ struct z_erofs_pcluster *const pcl = fe->pcl;
- while (clt->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--clt->icpage_ptr, NULL, page))
+ while (fe->icpage_ptr > pcl->compressed_pages)
+ if (!cmpxchg(--fe->icpage_ptr, NULL, page))
return true;
return false;
}
/* callers must be with collection lock held */
-static int z_erofs_attach_page(struct z_erofs_collector *clt,
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct page *page, enum z_erofs_page_type type,
bool pvec_safereuse)
{
int ret;
/* give priority for inplaceio */
- if (clt->mode >= COLLECT_PRIMARY &&
+ if (fe->mode >= COLLECT_PRIMARY &&
type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(clt, page))
+ z_erofs_try_inplace_io(fe, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
pvec_safereuse);
- clt->cl->vcnt += (unsigned int)ret;
+ fe->cl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
- z_erofs_next_pcluster_t *owned_head = &clt->owned_head;
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = COLLECT_PRIMARY_FOLLOWED;
return;
}
@@ -401,24 +397,24 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- clt->mode = COLLECT_PRIMARY_HOOKED;
- clt->tailpcl = NULL;
+ f->mode = COLLECT_PRIMARY_HOOKED;
+ f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- clt->mode = COLLECT_PRIMARY;
+ f->mode = COLLECT_PRIMARY;
}
-static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
struct z_erofs_collection *cl;
unsigned int length;
/* to avoid unexpected loop formed by corrupted images */
- if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -449,15 +445,15 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
}
mutex_lock(&cl->lock);
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
- z_erofs_try_to_claim_pcluster(clt);
- clt->cl = cl;
+ z_erofs_try_to_claim_pcluster(fe);
+ fe->cl = cl;
return 0;
}
-static int z_erofs_register_collection(struct z_erofs_collector *clt,
+static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
@@ -485,8 +481,8 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
/* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = clt->owned_head;
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ pcl->next = fe->owned_head;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
cl->pageofs = map->m_la & ~PAGE_MASK;
@@ -512,18 +508,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
}
if (grp != &pcl->obj) {
- clt->pcl = container_of(grp,
+ fe->pcl = container_of(grp,
struct z_erofs_pcluster, obj);
err = -EEXIST;
goto err_out;
}
}
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->owned_head = &pcl->next;
- clt->pcl = pcl;
- clt->cl = cl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
+ fe->cl = cl;
return 0;
err_out:
@@ -532,18 +528,18 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
int ret;
- DBG_BUGON(clt->cl);
+ DBG_BUGON(fe->cl);
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (map->m_flags & EROFS_MAP_META) {
if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
@@ -555,28 +551,28 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
- clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
} else {
tailpacking:
- ret = z_erofs_register_collection(clt, inode, map);
+ ret = z_erofs_register_collection(fe, inode, map);
if (!ret)
goto out;
if (ret != -EEXIST)
return ret;
}
- ret = z_erofs_lookup_collection(clt, inode, map);
+ ret = z_erofs_lookup_collection(fe, inode, map);
if (ret) {
- erofs_workgroup_put(&clt->pcl->obj);
+ erofs_workgroup_put(&fe->pcl->obj);
return ret;
}
out:
- z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- clt->cl->pagevec, clt->cl->vcnt);
+ z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+ fe->cl->pagevec, fe->cl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- clt->icpage_ptr = clt->pcl->compressed_pages +
- z_erofs_pclusterpages(clt->pcl);
+ fe->icpage_ptr = fe->pcl->compressed_pages +
+ z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -610,24 +606,24 @@ static void z_erofs_collection_put(struct z_erofs_collection *cl)
erofs_workgroup_put(&pcl->obj);
}
-static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = clt->cl;
+ struct z_erofs_collection *cl = fe->cl;
if (!cl)
return false;
- z_erofs_pagevec_ctor_exit(&clt->vector, false);
+ z_erofs_pagevec_ctor_exit(&fe->vector, false);
mutex_unlock(&cl->lock);
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
z_erofs_collection_put(cl);
- clt->cl = NULL;
+ fe->cl = NULL;
return true;
}
@@ -651,7 +647,6 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
- struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
bool tight = true;
@@ -672,7 +667,7 @@ repeat:
if (offset + cur >= map->m_la &&
offset + cur < map->m_la + map->m_llen) {
/* didn't get a valid collection previously (very rare) */
- if (!clt->cl)
+ if (!fe->cl)
goto restart_now;
goto hitted;
}
@@ -680,7 +675,7 @@ repeat:
/* go ahead the next map_blocks */
erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
- if (z_erofs_collector_end(clt))
+ if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
@@ -693,11 +688,11 @@ restart_now:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
- err = z_erofs_collector_begin(clt, inode, map);
+ err = z_erofs_collector_begin(fe, inode, map);
if (err)
goto err_out;
- if (z_erofs_is_inline_pcluster(clt->pcl)) {
+ if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
@@ -709,20 +704,18 @@ restart_now:
goto err_out;
}
get_page(fe->map.buf.page);
- WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page);
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
} else {
- /* preload all compressed pages (can change mode if needed) */
+ /* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, cache_strategy, pagepool);
}
-
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
@@ -730,8 +723,8 @@ hitted:
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or pagevec (should be processed in strict order.)
*/
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
- clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
+ fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -746,18 +739,18 @@ hitted:
Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
if (cur)
- tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type,
- clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ err = z_erofs_attach_page(fe, page, page_type,
+ fe->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
alloc_page(GFP_NOFS | __GFP_NOFAIL);
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(clt, newpage,
+ err = z_erofs_attach_page(fe, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
@@ -773,7 +766,7 @@ retry:
/* bump up the number of spiltted parts of a page */
++spiltted;
/* also update nr_pages */
- clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+ fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
next_part:
/* can be used for verification */
map->m_llen = offset + cur - map->m_la;
@@ -1073,12 +1066,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* wake up the caller thread for sync decompression */
if (sync) {
- unsigned long flags;
-
- spin_lock_irqsave(&io->u.wait.lock, flags);
if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
+ complete(&io->u.done);
+
return;
}
@@ -1098,10 +1088,10 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
struct page **pagepool,
- struct address_space *mc,
- gfp_t gfp)
+ struct address_space *mc)
{
const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct address_space *mapping;
@@ -1224,7 +1214,7 @@ jobqueue_init(struct super_block *sb,
} else {
fg_out:
q = fgq;
- init_waitqueue_head(&fgq->u.wait);
+ init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
}
q->sb = sb;
@@ -1309,7 +1299,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
- z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
struct block_device *last_bdev;
@@ -1357,8 +1347,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
+ MNGD_MAPPING(sbi));
if (!page)
continue;
@@ -1370,15 +1359,14 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, mdev.m_bdev);
last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
if (f->readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
@@ -1417,7 +1405,7 @@ static void z_erofs_runqueue(struct super_block *sb,
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
@@ -1428,8 +1416,7 @@ static void z_erofs_runqueue(struct super_block *sb,
return;
/* wait until all bios are completed */
- io_wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
@@ -1517,7 +1504,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
err = z_erofs_do_read_page(&f, page, &pagepool);
z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
@@ -1567,7 +1554,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
put_page(page);
}
z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, nr_pages));
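Besides folding struct z_erofs_collector into the decompression frontend, the zdata.c changes replace the locked waitqueue used for synchronous decompression with a completion: the last bio to finish calls complete(), and the submitting context blocks in wait_for_completion_io(). A generic sketch of that pairing, with placeholder names, outside the erofs specifics:

#include <linux/completion.h>
#include <linux/atomic.h>

struct example_queue {
        struct completion done;
        atomic_t pending_bios;
};

static void example_queue_init(struct example_queue *q, int nr_bios)
{
        init_completion(&q->done);
        atomic_set(&q->pending_bios, nr_bios);
}

/* called from each bio's end_io path */
static void example_queue_endio(struct example_queue *q)
{
        if (atomic_dec_and_test(&q->pending_bios))
                complete(&q->done);     /* last completion wakes the submitter */
}

/* called from the submitting context for sync decompression */
static void example_queue_wait(struct example_queue *q)
{
        wait_for_completion_io(&q->done);
}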
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index e043216b545f..800b11c53f57 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -97,7 +97,7 @@ struct z_erofs_decompressqueue {
z_erofs_next_pcluster_t head;
union {
- wait_queue_head_t wait;
+ struct completion done;
struct work_struct work;
} u;
};
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 361b1d6e4bf9..572f0b8151ba 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -431,48 +431,47 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn = m->lcn;
- int err;
- if (lcn < lookback_distance) {
- erofs_err(m->inode->i_sb,
- "bogus lookback distance @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
- /* load extent head logical cluster if needed */
- lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
- if (err)
- return err;
+ /* load extent head logical cluster if needed */
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lookback_distance = m->delta[0];
+ continue;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
DBG_BUGON(1);
- return -EFSCORRUPTED;
+ return -EOPNOTSUPP;
}
- return z_erofs_extent_lookback(m, m->delta[0]);
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- m->headtype = m->type;
- map->m_la = (lcn << lclusterbits) | m->clusterofs;
- break;
- default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- return 0;
+
+ erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
@@ -494,7 +493,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1 << lclusterbits;
+ map->m_plen = 1ULL << lclusterbits;
return 0;
}
lcn = m->lcn + 1;
@@ -540,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedlcs << lclusterbits;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
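Two independent fixes in zmap.c: the extent lookback is rewritten from tail recursion into a while loop, bounding kernel stack use even on corrupted images with long lookback chains, and the pcluster length computations are widened to 64-bit before shifting, since a plain `1 << lclusterbits` or `m->compressedlcs << lclusterbits` is evaluated in 32-bit arithmetic and can wrap for large values. A small standalone illustration of the wrap the casts avoid (hypothetical values, not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int lcs  = 0x40000;    /* hypothetical compressed lcluster count */
        unsigned int bits = 16;         /* hypothetical lclusterbits */

        uint64_t wrong = lcs << bits;             /* evaluated in 32 bits: wraps to 0 */
        uint64_t right = (uint64_t)lcs << bits;   /* widened first: 0x400000000 */

        printf("%llx %llx\n", (unsigned long long)wrong,
               (unsigned long long)right);
        return 0;
}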
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..e3e55d5e0be1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,7 +56,6 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
-#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
@@ -118,7 +117,7 @@ bool path_noexec(const struct path *path)
* Note that a shared library must be both readable and executable due to
* security reasons.
*
- * Also note that we take the address to load from from the file itself.
+ * Also note that we take the address to load from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
@@ -495,8 +494,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding an
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -536,7 +541,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (!valid_arg_len(bprm, len))
goto out;
- /* We're going to work our way backwords. */
+ /* We're going to work our way backwards. */
pos = bprm->p;
str += len;
bprm->p -= len;
@@ -1269,7 +1274,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Must be called _before_ exec_mmap() as bprm->mm is
- * not visibile until then. This also enables the update
+ * not visible until then. This also enables the update
* to be lockless.
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
@@ -1303,12 +1308,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- /*
- * Ensure that the uaccess routines can actually operate on userspace
- * pointers:
- */
- force_uaccess_begin();
-
if (me->flags & PF_KTHREAD)
free_kthread_struct(me);
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
@@ -1897,6 +1896,9 @@ static int do_execveat_common(int fd, struct filename *filename,
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1923,6 +1925,19 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1951,6 +1966,8 @@ int kernel_execve(const char *kernel_filename,
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
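The exec changes close the argc == 0 hole: bprm_stack_limits() now reserves room for at least one argv pointer, do_execveat_common() injects an empty string as argv[0] (bumping argc to 1) when userspace passes an empty argv, and kernel_execve() callers with no arguments get -EINVAL. A hedged userspace illustration of the case being handled; after this change the execed program sees argv[0] == "" and argv[1] == NULL instead of finding environment strings where argv[1] would be:

#include <unistd.h>
#include <stdio.h>

int main(void)
{
        char *const argv[] = { NULL };                       /* empty argv */
        char *const envp[] = { "SECRET=do-not-read", NULL };

        execve("/bin/echo", argv, envp);                     /* path is illustrative */
        perror("execve");
        return 1;
}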
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 619e5b4bed10..c6800b880920 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -203,7 +203,8 @@ struct exfat_mount_options {
/* on error: continue, panic, remount-ro */
enum exfat_error_mode errors;
unsigned utf8:1, /* Use of UTF-8 character set */
- discard:1; /* Issue discard requests on deletions */
+ discard:1, /* Issue discard requests on deletions */
+ keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
};
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index d890fd34bb2d..2f5130059236 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -218,8 +218,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (exfat_free_cluster(inode, &clu))
return -EIO;
- exfat_clear_volume_dirty(sb);
-
return 0;
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index df805bd05508..fc0ea1684880 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -490,7 +490,8 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations exfat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = exfat_readpage,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index af4eb39cc0c3..a02a04a993bf 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -65,11 +65,14 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags)
return ret;
}
-/* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int exfat_striptail_len(unsigned int len, const char *name)
+/* returns the length of a struct qstr, ignoring trailing dots if necessary */
+static unsigned int exfat_striptail_len(unsigned int len, const char *name,
+ bool keep_last_dots)
{
- while (len && name[len - 1] == '.')
- len--;
+ if (!keep_last_dots) {
+ while (len && name[len - 1] == '.')
+ len--;
+ }
return len;
}
@@ -83,7 +86,8 @@ static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
struct super_block *sb = dentry->d_sb;
struct nls_table *t = EXFAT_SB(sb)->nls_io;
const unsigned char *name = qstr->name;
- unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
unsigned long hash = init_name_hash(dentry);
int i, charlen;
wchar_t c;
@@ -104,8 +108,10 @@ static int exfat_d_cmp(const struct dentry *dentry, unsigned int len,
{
struct super_block *sb = dentry->d_sb;
struct nls_table *t = EXFAT_SB(sb)->nls_io;
- unsigned int alen = exfat_striptail_len(name->len, name->name);
- unsigned int blen = exfat_striptail_len(len, str);
+ unsigned int alen = exfat_striptail_len(name->len, name->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
+ unsigned int blen = exfat_striptail_len(len, str,
+ EXFAT_SB(sb)->options.keep_last_dots);
wchar_t c1, c2;
int charlen, i;
@@ -136,7 +142,8 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr)
{
struct super_block *sb = dentry->d_sb;
const unsigned char *name = qstr->name;
- unsigned int len = exfat_striptail_len(qstr->len, qstr->name);
+ unsigned int len = exfat_striptail_len(qstr->len, qstr->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
unsigned long hash = init_name_hash(dentry);
int i, charlen;
unicode_t u;
@@ -161,8 +168,11 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct super_block *sb = dentry->d_sb;
- unsigned int alen = exfat_striptail_len(name->len, name->name);
- unsigned int blen = exfat_striptail_len(len, str);
+ unsigned int alen = exfat_striptail_len(name->len, name->name,
+ EXFAT_SB(sb)->options.keep_last_dots);
+ unsigned int blen = exfat_striptail_len(len, str,
+ EXFAT_SB(sb)->options.keep_last_dots);
+
unicode_t u_a, u_b;
int charlen, i;
@@ -416,13 +426,25 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
+ int pathlen = strlen(path);
- /* strip all trailing periods */
- namelen = exfat_striptail_len(strlen(path), path);
+ /*
+ * get the length of the pathname excluding
+ * trailing periods, if any.
+ */
+ namelen = exfat_striptail_len(pathlen, path, false);
+ if (EXFAT_SB(sb)->options.keep_last_dots) {
+ /*
+ * Do not allow the creation of files with names
+ * ending with period(s).
+ */
+ if (!lookup && (namelen < pathlen))
+ return -EINVAL;
+ namelen = pathlen;
+ }
if (!namelen)
return -ENOENT;
-
- if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
+ if (pathlen > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
return -ENAMETOOLONG;
/*
@@ -554,7 +576,6 @@ static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir,
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
&info);
- exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -812,7 +833,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
/* This doesn't modify ei */
ei->dir.dir = DIR_DELETED;
- exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -846,7 +866,6 @@ static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
&info);
- exfat_clear_volume_dirty(sb);
if (err)
goto unlock;
@@ -976,7 +995,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
goto unlock;
}
ei->dir.dir = DIR_DELETED;
- exfat_clear_volume_dirty(sb);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_atime = current_time(dir);
@@ -1311,7 +1329,6 @@ del_out:
*/
new_ei->dir.dir = DIR_DELETED;
}
- exfat_clear_volume_dirty(sb);
out:
return ret;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8c9fb7dcec16..8ca21e7917d1 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -100,7 +100,6 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data;
- bool sync;
/* retain persistent-flags */
new_flags |= sbi->vol_flags_persistent;
@@ -119,16 +118,11 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags)
p_boot->vol_flags = cpu_to_le16(new_flags);
- if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh))
- sync = true;
- else
- sync = false;
-
set_buffer_uptodate(sbi->boot_bh);
mark_buffer_dirty(sbi->boot_bh);
- if (sync)
- sync_dirty_buffer(sbi->boot_bh);
+ __sync_dirty_buffer(sbi->boot_bh, REQ_SYNC | REQ_FUA | REQ_PREFLUSH);
+
return 0;
}
@@ -174,6 +168,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",errors=remount-ro");
if (opts->discard)
seq_puts(m, ",discard");
+ if (opts->keep_last_dots)
+ seq_puts(m, ",keep_last_dots");
if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
return 0;
@@ -183,7 +179,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb)
{
struct exfat_inode_info *ei;
- ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -217,6 +213,7 @@ enum {
Opt_charset,
Opt_errors,
Opt_discard,
+ Opt_keep_last_dots,
Opt_time_offset,
/* Deprecated options */
@@ -243,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_string("iocharset", Opt_charset),
fsparam_enum("errors", Opt_errors, exfat_param_enums),
fsparam_flag("discard", Opt_discard),
+ fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_s32("time_offset", Opt_time_offset),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
@@ -297,6 +295,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_discard:
opts->discard = 1;
break;
+ case Opt_keep_last_dots:
+ opts->keep_last_dots = 1;
+ break;
case Opt_time_offset:
/*
* Make the limit 24 just in case someone invents something
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index df14e750e9fe..998dd2ac8008 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long offset;
unsigned long block;
struct ext2_group_desc * gdp;
- struct backing_dev_info *bdi;
-
- bdi = inode_to_bdi(inode);
- if (bdi_rw_congested(bdi))
- return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 602578b72d8c..52377a0ee735 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -967,7 +967,8 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
}
const struct address_space_operations ext2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
@@ -982,7 +983,8 @@ const struct address_space_operations ext2_aops = {
};
const struct address_space_operations ext2_nobh_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
@@ -998,8 +1000,7 @@ const struct address_space_operations ext2_nobh_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
/*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 94f1fbd7d3ac..f6a19f6d9f6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
@@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits)
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
res += 1LL << (3*(bits-2));
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
/* Does block tree limit file size? */
- if (res < upper_limit)
+ if (res + meta_blocks <= upper_limit)
goto check_lfs;
res = upper_limit;
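ext2_max_size() previously checked only the data-block count against the bitmap-imposed upper limit; the hunk adds the indirect metadata blocks (one single-indirect block, 1 + ppb for the double-indirect tree, and 1 + ppb + ppb^2 for the triple-indirect tree, where ppb is pointers per block, i.e. block size / 4) so the computed limit also leaves room for that overhead. A small standalone check of the arithmetic for 4 KiB blocks (illustrative, not from the patch):

#include <stdio.h>

int main(void)
{
        unsigned long long ppb = 4096 / 4;      /* 1024 pointers per 4 KiB block */
        unsigned long long meta_blocks = 1;     /* single indirect */

        meta_blocks += 1 + ppb;                 /* double indirect */
        meta_blocks += 1 + ppb + ppb * ppb;     /* triple indirect */

        printf("%llu\n", meta_blocks);          /* prints 1050627 */
        return 0;
}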
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a0fb0c4bdc7c..78ee3ef795ae 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -411,6 +411,7 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 4666b55b736e..5504f72bbbbe 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -292,15 +292,10 @@ void ext4_release_system_zone(struct super_block *sb)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with some other filesystem metadata blocks.
- */
-int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
- unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_system_blocks *system_blks;
struct ext4_system_zone *entry;
struct rb_node *n;
@@ -329,7 +324,9 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else {
- ret = (entry->ino == inode->i_ino);
+ ret = 0;
+ if (inode)
+ ret = (entry->ino == inode->i_ino);
break;
}
}
@@ -338,6 +335,17 @@ out_rcu:
return ret;
}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bcd3b9bf8069..a743b1e3b89e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1046,6 +1046,8 @@ struct ext4_inode_info {
/* Fast commit related info */
+ /* For tracking dentry create updates */
+ struct list_head i_fc_dilist;
struct list_head i_fc_list; /*
* inodes that need fast commit
* protected by sbi->s_fc_lock.
@@ -1279,7 +1281,7 @@ struct ext4_inode_info {
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -2271,6 +2273,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -3030,7 +3036,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3062,6 +3068,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
@@ -3707,6 +3714,9 @@ extern int ext4_inode_block_valid(struct inode *inode,
unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count);
+
/* extents.c */
struct ext4_ext_path;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c0f3f83e0c1b..e473fde6b64b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3368,7 +3368,6 @@ static int ext4_split_extent(handle_t *handle,
return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
- split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -4501,9 +4500,9 @@ retry:
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
@@ -4575,6 +4574,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4691,7 +4694,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(inode, offset, len);
+ ret = ext4_punch_hole(file, offset, len);
goto exit;
}
@@ -4700,12 +4703,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto exit;
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(inode, offset, len);
+ ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(inode, offset, len);
+ ret = ext4_insert_range(file, offset, len);
goto exit;
}
@@ -4741,6 +4744,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
@@ -5242,8 +5249,9 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
@@ -5295,6 +5303,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5388,8 +5400,9 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
@@ -5446,6 +5459,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
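The fallocate-family helpers (zero range, collapse range, insert range, and plain allocation) are changed to take the struct file rather than the inode so they can call file_modified() after inode_dio_wait(); that helper strips SUID/SGID bits and updates the timestamps before any blocks are manipulated. A hedged sketch of the ordering these hunks establish, with a placeholder range operation:

#include <linux/fs.h>

static long example_do_range_op(struct inode *inode, loff_t offset, loff_t len)
{
        return 0;       /* placeholder for the actual range operation */
}

static long example_fallocate_op(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        long ret;

        inode_lock(inode);
        inode_dio_wait(inode);          /* wait for in-flight direct I/O */

        ret = file_modified(file);      /* kill suid/sgid, update c/mtime */
        if (ret)
                goto out;

        ret = example_do_range_op(inode, offset, len);
out:
        inode_unlock(inode);
        return ret;
}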
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 7964ee34e322..3d72565ec6e8 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -199,6 +199,7 @@ void ext4_fc_init_inode(struct inode *inode)
ext4_fc_reset_inode(inode);
ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
INIT_LIST_HEAD(&ei->i_fc_list);
+ INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
atomic_set(&ei->i_fc_updates, 0);
}
@@ -279,6 +280,8 @@ void ext4_fc_stop_update(struct inode *inode)
void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_fc_dentry_update *fc_dentry;
if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
@@ -286,7 +289,7 @@ void ext4_fc_del(struct inode *inode)
restart:
spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list)) {
+ if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}
@@ -295,8 +298,33 @@ restart:
ext4_fc_wait_committing_inode(inode);
goto restart;
}
- list_del_init(&ei->i_fc_list);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+
+ if (!list_empty(&ei->i_fc_list))
+ list_del_init(&ei->i_fc_list);
+
+ /*
+ * Since this inode is getting removed, let's also remove all FC
+ * dentry create references, since there is no need to log them anyway.
+ */
+ if (list_empty(&ei->i_fc_dilist)) {
+ spin_unlock(&sbi->s_fc_lock);
+ return;
+ }
+
+ fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+
+ return;
}
/*
@@ -351,13 +379,6 @@ static int ext4_fc_track_template(
tid_t tid = 0;
int ret;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return -EOPNOTSUPP;
-
- if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
- return -EINVAL;
-
tid = handle->h_transaction->t_tid;
mutex_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
@@ -427,7 +448,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.name = node->fcd_iname;
}
node->fcd_name.len = dentry->d_name.len;
-
+ INIT_LIST_HEAD(&node->fcd_dilist);
spin_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
@@ -435,6 +456,20 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+
+ /*
+ * This helps us keep track of all fc_dentry updates that are part of
+ * this ext4 inode. So in case the inode is getting unlinked, before
+ * we even get a chance to fsync, we can remove all fc_dentry
+ * references while evicting the inode in ext4_fc_del().
+ * Also with this, we don't need to loop over all the inodes in
+ * sbi->s_fc_q to get the corresponding inode in
+ * ext4_fc_commit_dentry_updates().
+ */
+ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
+ }
spin_unlock(&sbi->s_fc_lock);
mutex_lock(&ei->i_fc_lock);
@@ -452,12 +487,22 @@ void __ext4_fc_track_unlink(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_unlink(inode, dentry, ret);
+ trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_unlink(handle, inode, dentry);
}
void __ext4_fc_track_link(handle_t *handle,
@@ -471,12 +516,22 @@ void __ext4_fc_track_link(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_link(inode, dentry, ret);
+ trace_ext4_fc_track_link(handle, inode, dentry, ret);
}
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_link(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_link(handle, inode, dentry);
}
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
@@ -490,12 +545,22 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_create(inode, dentry, ret);
+ trace_ext4_fc_track_create(handle, inode, dentry, ret);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_create(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_create(handle, inode, dentry);
}
/* __track_fn for inode tracking */
@@ -511,6 +576,7 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
@@ -522,8 +588,15 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
return;
}
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
- trace_ext4_fc_track_inode(inode, ret);
+ trace_ext4_fc_track_inode(handle, inode, ret);
}
struct __track_range_args {
@@ -561,18 +634,26 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
args.start = start;
args.end = end;
ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
- trace_ext4_fc_track_range(inode, start, end, ret);
+ trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
@@ -954,7 +1035,7 @@ __releases(&sbi->s_fc_lock)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
- struct ext4_inode_info *ei, *ei_n;
+ struct ext4_inode_info *ei;
int ret;
if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
@@ -970,21 +1051,16 @@ __releases(&sbi->s_fc_lock)
spin_lock(&sbi->s_fc_lock);
continue;
}
-
- inode = NULL;
- list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
- inode = &ei->vfs_inode;
- break;
- }
- }
/*
- * If we don't find inode in our list, then it was deleted,
- * in which case, we don't need to record it's create tag.
+ * With fcd_dilist we no longer need to loop over sbi->s_fc_q to get
+ * the corresponding inode pointer.
*/
- if (!inode)
- continue;
+ WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
+
spin_unlock(&sbi->s_fc_lock);
/*
@@ -1088,11 +1164,12 @@ out:
}
static void ext4_fc_update_stats(struct super_block *sb, int status,
- u64 commit_time, int nblks)
+ u64 commit_time, int nblks, tid_t commit_tid)
{
struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
- jbd_debug(1, "Fast commit ended with status = %d", status);
+ jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+ status, commit_tid);
if (status == EXT4_FC_STATUS_OK) {
stats->fc_num_commits++;
stats->fc_numblks += nblks;
@@ -1110,7 +1187,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
} else {
stats->fc_skipped_commits++;
}
- trace_ext4_fc_commit_stop(sb, nblks, status);
+ trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
/*
@@ -1128,13 +1205,13 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
- trace_ext4_fc_commit_start(sb);
-
- start_time = ktime_get();
-
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
+ trace_ext4_fc_commit_start(sb, commit_tid);
+
+ start_time = ktime_get();
+
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
if (ret == -EALREADY) {
@@ -1142,14 +1219,16 @@ restart_fc:
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
commit_tid > journal->j_commit_sequence)
goto restart_fc;
- ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
+ commit_tid);
return 0;
} else if (ret) {
/*
* Commit couldn't start. Just update stats and perform a
* full commit.
*/
- ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
+ commit_tid);
return jbd2_complete_transaction(journal, commit_tid);
}
@@ -1181,12 +1260,12 @@ restart_fc:
* don't react too strongly to vast changes in the commit time
*/
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
- ext4_fc_update_stats(sb, status, commit_time, nblks);
+ ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
return ret;
fallback:
ret = jbd2_fc_end_commit_fallback(journal);
- ext4_fc_update_stats(sb, status, 0, 0);
+ ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
}
@@ -1204,6 +1283,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
+ trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
spin_lock(&sbi->s_fc_lock);
@@ -1228,6 +1308,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
struct ext4_fc_dentry_update,
fcd_list);
list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
spin_unlock(&sbi->s_fc_lock);
if (fc_dentry->fcd_name.name &&
@@ -1875,8 +1956,8 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
if (state->fc_regions[i].ino == 0 ||
state->fc_regions[i].len == 0)
continue;
- if (blk >= state->fc_regions[i].pblk &&
- blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+ if (in_range(blk, state->fc_regions[i].pblk,
+ state->fc_regions[i].len))
return true;
}
return false;
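The fcd_dilist change above threads each dentry-create record onto a per-inode list in addition to the global queue, so eviction can drop an inode's records without scanning the queue. Below is a minimal user-space sketch of that double-linking pattern; every struct, field, and function name here is illustrative and not a kernel identifier.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *n) { n->prev = n->next = n; }

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

static void list_del(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

static int list_empty(const struct node *h) { return h->next == h; }

struct fc_update {                      /* stand-in for ext4_fc_dentry_update */
        int ino;
        struct node global_link;        /* plays the role of fcd_list */
        struct node owner_link;         /* plays the role of fcd_dilist */
};

struct fake_inode {                     /* stand-in for ext4_inode_info */
        int ino;
        struct node updates;            /* plays the role of i_fc_dilist */
};

int main(void)
{
        struct node global_q;
        struct fake_inode inode = { .ino = 42 };
        struct fc_update *u = calloc(1, sizeof(*u));

        list_init(&global_q);
        list_init(&inode.updates);
        if (!u)
                return 1;
        u->ino = inode.ino;
        list_add_tail(&u->global_link, &global_q);      /* global queue */
        list_add_tail(&u->owner_link, &inode.updates);  /* per-inode list */

        /* "Evicting" the inode drops its updates without a global scan. */
        while (!list_empty(&inode.updates)) {
                struct node *n = inode.updates.next;
                struct fc_update *victim = (struct fc_update *)
                        ((char *)n - offsetof(struct fc_update, owner_link));

                list_del(&victim->owner_link);
                list_del(&victim->global_link);
                free(victim);
        }
        printf("global queue empty: %d\n", list_empty(&global_q));
        return 0;
}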
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 083ad1cb705a..1db12847a83b 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -55,13 +55,13 @@ struct ext4_fc_del_range {
struct ext4_fc_dentry_info {
__le32 fc_parent_ino;
__le32 fc_ino;
- __u8 fc_dname[0];
+ __u8 fc_dname[];
};
/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
struct ext4_fc_inode {
__le32 fc_ino;
- __u8 fc_raw_inode[0];
+ __u8 fc_raw_inode[];
};
/* Value structure for tag EXT4_FC_TAG_TAIL. */
@@ -93,7 +93,6 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
- EXT4_FC_COMMIT_FAILED,
EXT4_FC_REASON_MAX
};
@@ -109,6 +108,7 @@ struct ext4_fc_dentry_update {
struct qstr fcd_name; /* Dirent name */
unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
struct list_head fcd_list;
+ struct list_head fcd_dilist;
};
struct ext4_fc_stats {
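The [0] to [] change in the structures above switches from a GNU zero-length array to a C99 flexible array member. A minimal stand-alone sketch of how such a trailing member is allocated and used, with made-up names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct record {
        unsigned int len;
        unsigned char data[];   /* flexible array member, was data[0] */
};

int main(void)
{
        const char *payload = "hello";
        struct record *r = malloc(sizeof(*r) + strlen(payload) + 1);

        if (!r)
                return 1;
        r->len = strlen(payload);
        memcpy(r->data, payload, r->len + 1);
        printf("%u bytes: %s\n", r->len, r->data);
        free(r);
        return 0;
}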
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8cc11715518a..6feb07e3e1eb 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,9 +36,11 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
@@ -61,7 +63,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -265,7 +267,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
goto out;
current->backing_dev_info = inode_to_bdi(inode);
- ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ ret = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
out:
@@ -509,7 +511,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index e42941803605..9c076262770d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1783,19 +1783,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE_ERR(dir, -err,
"error %d getting inode %lu block",
err, dir->i_ino);
- return true;
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1804,7 +1805,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1823,16 +1823,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
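The empty_inline_dir() hunk above inverts the default return value so that every error path reports the directory as not empty and only the fully verified path reports it as empty. A small stand-alone sketch of that assume-not-empty pattern, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

static bool dir_is_empty(const int *entries, int n)
{
        bool ret = false;       /* corrupted or unreadable => "not empty" */

        if (n < 0)              /* stand-in for an I/O or metadata error */
                goto out;
        for (int i = 0; i < n; i++)
                if (entries[i] != 0)    /* a live entry */
                        goto out;
        ret = true;             /* only the fully checked path says "empty" */
out:
        return ret;
}

int main(void)
{
        int live[] = { 0, 7, 0 };

        printf("%d %d %d\n", dir_is_empty(live, 3),
               dir_is_empty(NULL, -1), dir_is_empty(live, 0));
        return 0;
}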
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 01c9e4f743ba..646ece9b3455 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -137,8 +137,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -186,7 +184,7 @@ void ext4_evict_inode(struct inode *inode)
* journal. So although mm thinks everything is clean and
* ready for reaping the inode might still have some pages to
* write in the running transaction or waiting to be
- * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * checkpointed. Thus calling jbd2_journal_invalidate_folio()
* (via truncate_inode_pages()) to discard these buffers can
* cause data loss. Also even if we did not discard these
* buffers, we would have no way to find them after the inode
@@ -1571,16 +1569,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ struct folio *folio = page_folio(page);
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (page_mapped(page))
- clear_page_dirty_for_io(page);
- block_invalidatepage(page, 0, PAGE_SIZE);
- ClearPageUptodate(page);
+ if (folio_mapped(folio))
+ folio_clear_dirty_for_io(folio);
+ block_invalidate_folio(folio, 0,
+ folio_size(folio));
+ folio_clear_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
pagevec_release(&pvec);
}
@@ -1971,6 +1971,7 @@ out_no_pagelock:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
loff_t size;
unsigned int len;
@@ -1980,8 +1981,8 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return -EIO;
}
@@ -1993,6 +1994,15 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2594,6 +2604,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -3182,40 +3208,39 @@ static void ext4_readahead(struct readahead_control *rac)
ext4_mpage_readpages(inode, rac, NULL);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ext4_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- trace_ext4_invalidatepage(page, offset, length);
+ trace_ext4_invalidate_folio(folio, offset, length);
/* No journalling happens on data buffers when this function is used */
- WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
- block_invalidatepage(page, offset, length);
+ block_invalidate_folio(folio, offset, length);
}
-static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static int __ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset, length);
+ trace_ext4_journalled_invalidate_folio(folio, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_SIZE)
- ClearPageChecked(page);
+ if (offset == 0 && length == folio_size(folio))
+ folio_clear_checked(folio);
- return jbd2_journal_invalidatepage(journal, page, offset, length);
+ return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}
/* Wrapper for aops... */
-static void ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset,
+ size_t length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+ WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3409,6 +3434,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
out:
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3541,29 +3573,32 @@ const struct iomap_ops ext4_iomap_report_ops = {
};
/*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
+ * Whenever the folio is being dirtied, corresponding buffers should already
+ * be attached to the transaction (we take care of this in ext4_page_mkwrite()
+ * and ext4_write_begin()). However we cannot move buffers to dirty transaction
+ * lists here because ->dirty_folio is called under VFS locks and the folio
+ * is not necessarily locked.
*
- * We cannot just dirty the page and leave attached buffers clean, because the
+ * We cannot just dirty the folio and leave attached buffers clean, because the
* buffers' dirty state is "definitive". We cannot just set the buffers dirty
* or jbddirty because all the journalling code will explode.
*
- * So what we do is to mark the page "pending dirty" and next time writepage
+ * So what we do is to mark the folio "pending dirty" and next time writepage
* is called, propagate that into the buffers appropriately.
*/
-static int ext4_journalled_set_page_dirty(struct page *page)
+static bool ext4_journalled_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ WARN_ON_ONCE(!folio_buffers(folio));
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
-static int ext4_set_page_dirty(struct page *page)
+static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
- WARN_ON_ONCE(!page_has_buffers(page));
- return __set_page_dirty_buffers(page);
+ WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
+ WARN_ON_ONCE(!folio_buffers(folio));
+ return block_dirty_folio(mapping, folio);
}
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
@@ -3580,9 +3615,9 @@ static const struct address_space_operations ext4_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3598,9 +3633,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
+ .dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_journalled_invalidatepage,
+ .invalidate_folio = ext4_journalled_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3615,9 +3650,9 @@ static const struct address_space_operations ext4_da_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3629,9 +3664,8 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = noop_invalidatepage,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3919,12 +3953,14 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
@@ -3967,6 +4003,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole, offset + length must end at least one block before
+ * the last addressable byte (s_bitmap_maxbytes). Trim the length if it
+ * would go beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -3982,6 +4026,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5204,13 +5252,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
- * buffers that are attached to a page stradding i_size and are undergoing
+ * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
+ * buffers that are attached to a folio straddling i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
- struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0;
@@ -5218,25 +5265,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * If the page is fully truncated, we don't need to wait for any commit
- * (and we even should not as __ext4_journalled_invalidatepage() may
- * strip all buffers from the page but keep the page dirty which can then
- * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * If the folio is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidate_folio() may
+ * strip all buffers from the folio but keep the folio dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
- * the page remain valid. This is most beneficial for the common case of
+ * the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
*/
if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
- page = find_lock_page(inode->i_mapping,
+ struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!page)
+ if (!folio)
return;
- ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_SIZE - offset);
- unlock_page(page);
- put_page(page);
+ ret = __ext4_journalled_invalidate_folio(folio, offset,
+ folio_size(folio) - offset);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret != -EBUSY)
return;
commit_tid = 0;
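The punch-hole hunk above clamps the requested range so that offset + length never reaches past s_bitmap_maxbytes minus one block. A tiny sketch of that clamp with example numbers, not values read from a real filesystem:

#include <stdio.h>

int main(void)
{
        unsigned long long blocksize = 4096;
        unsigned long long bitmap_maxbytes = 1ULL << 32;        /* example limit */
        unsigned long long max_length = bitmap_maxbytes - blocksize;

        unsigned long long offset = (1ULL << 32) - 8192;        /* near the limit */
        unsigned long long length = 65536;                      /* would overshoot */

        if (offset + length > max_length)
                length = max_length - offset;

        printf("clamped length: %llu bytes\n", length);         /* prints 4096 */
        return 0;
}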
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a8022c2c6a58..ba44fa1be70a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -269,7 +269,7 @@ out:
return err ? err : 0;
}
-/**
+/*
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
@@ -290,7 +290,7 @@ static void memswap(void *a, void *b, size_t len)
}
}
-/**
+/*
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
@@ -344,7 +344,7 @@ void ext4_reset_inode_seed(struct inode *inode)
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
-/**
+/*
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
@@ -1652,3 +1652,19 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+static void set_overhead(struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_rdonly(sb) || sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))
+ return 0;
+
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 67ac95c4cd9b..252c168454c7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1000,7 +1000,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= 2)
return 0;
- if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
return 1;
}
@@ -1689,7 +1689,7 @@ static int mb_test_and_clear_bits(void *bm, int cur, int len)
return zero_bit;
}
-void ext4_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1996,7 +1996,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -3825,7 +3825,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3844,7 +3844,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
@@ -3899,69 +3899,103 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, clen, err;
+ int i, err;
int already;
+ unsigned int clen, clen_changed, thisgrp_len;
- clen = EXT4_B2C(sbi, len);
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
- }
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * With flex_bg, (block, len) may span more than one group. In
+ * that case we need to get the metadata of each group we touch,
+ * which is what this loop iterates over.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
+ ext4_error(sb, "Marking blocks in system zone - "
+ "Block = %llu, len = %u",
+ block, thisgrp_len);
+ bitmap_bh = NULL;
+ break;
+ }
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto out_err;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
- already++;
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
- if (state)
- ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- group, gdp));
- }
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen + already;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen - already;
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
- ext4_unlock_group(sb, group);
+ ext4_unlock_group(sb, group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
- atomic64_sub(len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
}
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
- goto out_err;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
+ brelse(bitmap_bh);
}
/*
@@ -4433,7 +4467,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, efd_node);
- ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+ mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -4474,7 +4508,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- ext4_set_bits(bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
}
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
@@ -5846,17 +5880,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @bh: optional buffer of the block to be freed
* @block: starting physical block to be freed
* @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
@@ -5873,80 +5907,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
- if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_inode_block_valid(inode, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
-
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5977,13 +5937,7 @@ do_more:
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -6054,7 +6008,7 @@ do_more:
NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
} else
@@ -6115,6 +6069,103 @@ error_return:
}
/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
+ might_sleep();
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
+ return;
+}
+
+/**
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
@@ -6170,11 +6221,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
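The rewritten ext4_mb_mark_bb() above walks a (block, len) range one block group at a time instead of assuming it fits in a single group. A stand-alone sketch of that per-group chunking loop, using an example group size rather than on-disk geometry:

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL       /* example group size */

int main(void)
{
        unsigned long long block = 32000, len = 70000;

        while (len > 0) {
                unsigned long long group = block / BLOCKS_PER_GROUP;
                unsigned long long offset = block % BLOCKS_PER_GROUP;
                unsigned long long chunk = BLOCKS_PER_GROUP - offset;

                if (chunk > len)
                        chunk = len;
                printf("group %llu: mark %llu blocks at offset %llu\n",
                       group, chunk, offset);
                block += chunk;
                len -= chunk;
        }
        return 0;
}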
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8cf0a924a49b..767b4bfe39c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
@@ -2997,14 +2997,14 @@ bool ext4_empty_dir(struct inode *inode)
if (inode->i_size < ext4_dir_rec_len(1, NULL) +
ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -3012,7 +3012,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
@@ -3021,7 +3021,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -3035,7 +3035,7 @@ bool ext4_empty_dir(struct inode *inode)
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
@@ -3891,12 +3891,19 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
ext4_fc_mark_ineligible(old.inode->i_sb,
EXT4_FC_REASON_RENAME_DIR, handle);
} else {
+ struct super_block *sb = old.inode->i_sb;
+
if (new.inode)
ext4_fc_track_unlink(handle, new.dentry);
- __ext4_fc_track_link(handle, old.inode, new.dentry);
- __ext4_fc_track_unlink(handle, old.inode, old.dentry);
- if (whiteout)
- __ext4_fc_track_create(handle, whiteout, old.dentry);
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout,
+ old.dentry);
+ }
}
if (new.inode) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1d370364230e..14695e2b5042 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
@@ -323,10 +325,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -372,10 +373,8 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0;
- io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
- bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -398,10 +397,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
@@ -421,10 +419,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL) {
+ if (io->io_bio == NULL)
io_submit_init_bio(io, bh);
- io->io_bio->bi_write_hint = inode->i_write_hint;
- }
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 4cd62f1d848c..af491e170c4a 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -109,7 +109,7 @@ static void verity_work(struct work_struct *work)
struct bio *bio = ctx->bio;
/*
- * fsverity_verify_bio() may call readpages() again, and although verity
+ * fsverity_verify_bio() may call readahead() again, and although verity
* will be disabled for that, decryption may still be needed, causing
* another bio_post_read_ctx to be allocated. So to guarantee that
* mempool_alloc() never deadlocks we must free the current ctx first.
@@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- rac ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ee8f02f406cb..90a941d20dff 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include "ext4_jbd2.h"
@@ -483,7 +484,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
}
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
- ext4_set_bits(bh->b_data, first_cluster - start, count2);
+ mb_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -632,7 +633,7 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0,
+ mb_set_bits(bh->b_data, 0,
EXT4_NUM_B2C(sbi, overhead));
}
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -2100,7 +2101,7 @@ retry:
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- if (jiffies - last_update_time > HZ * 10) {
+ if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
"resized to %llu blocks",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c5021ca0a28a..1466fbdbc8e3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1199,20 +1199,25 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int i, err;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
-
- flush_work(&sbi->s_error_work);
- destroy_workqueue(sbi->rsv_conversion_wq);
- ext4_release_orphan_info(sb);
-
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flushing sbi->s_error_work.
+ * A user may read /proc/fs/ext4/xx/mb_groups during umount; if reading
+ * the metadata fails verification, error work gets queued, and
+ * flush_stashed_error_work() calling start_this_handle() may then
+ * trigger a BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ ext4_unregister_li_request(sb);
+ ext4_quota_off_umount(sb);
+
+ flush_work(&sbi->s_error_work);
+ destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
@@ -1316,7 +1321,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;
- ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -2021,12 +2026,12 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
#define EXT4_SPEC_s_commit_interval (1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
#define EXT4_SPEC_s_sb_block (1 << 18)
+#define EXT4_SPEC_mb_optimize_scan (1 << 19)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
char *test_dummy_enc_arg;
int s_jquota_fmt; /* Format of quota to use */
- int mb_optimize_scan;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
#endif
@@ -2045,8 +2050,8 @@ struct ext4_fs_context {
unsigned int mask_s_mount_opt;
unsigned int vals_s_mount_opt2;
unsigned int mask_s_mount_opt2;
- unsigned int vals_s_mount_flags;
- unsigned int mask_s_mount_flags;
+ unsigned long vals_s_mount_flags;
+ unsigned long mask_s_mount_flags;
unsigned int opt_flags; /* MOPT flags */
unsigned int spec;
u32 s_max_batch_time;
@@ -2149,23 +2154,36 @@ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name |= flag; \
-} \
+}
+
+#define EXT4_CLEAR_CTX(name) \
static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name &= ~flag; \
-} \
+}
+
+#define EXT4_TEST_CTX(name) \
static inline unsigned long \
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
return (ctx->vals_s_##name & flag); \
-} \
+}
-EXT4_SET_CTX(flags);
+EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
-EXT4_SET_CTX(mount_flags);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
+{
+ set_bit(bit, &ctx->mask_s_mount_flags);
+ set_bit(bit, &ctx->vals_s_mount_flags);
+}
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
@@ -2235,7 +2253,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->key);
return 0;
case Opt_abort:
- ctx_set_mount_flags(ctx, EXT4_MF_FS_ABORTED);
+ ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
return 0;
case Opt_i_version:
ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
@@ -2451,12 +2469,17 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx_clear_mount_opt(ctx, m->mount_opt);
return 0;
case Opt_mb_optimize_scan:
- if (result.int_32 != 0 && result.int_32 != 1) {
+ if (result.int_32 == 1) {
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else if (result.int_32 == 0) {
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else {
ext4_msg(NULL, KERN_WARNING,
"mb_optimize_scan should be set to 0 or 1.");
return -EINVAL;
}
- ctx->mb_optimize_scan = result.int_32;
return 0;
}
@@ -3468,8 +3491,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
+ loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
+ unsigned int ppb = 1 << (bits - 2);
/*
* This is calculated to be the largest file size for a dense, block
@@ -3501,27 +3525,42 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
+ /* Compute how many blocks we can address by block tree */
+ res += ppb;
+ res += ppb * ppb;
+ res += ((loff_t)ppb) * ppb * ppb;
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
+ /* Does block tree limit file size? */
+ if (res + meta_blocks <= upper_limit)
+ goto check_lfs;
+
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT4_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* triple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+ DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return (loff_t)res;
+ return res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -4138,9 +4177,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -4369,7 +4410,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- ctx->mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -5249,9 +5289,18 @@ no_journal:
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled, recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -5320,12 +5369,12 @@ no_journal:
* turned off by passing "mb_optimize_scan=0". This can also be
* turned on forcefully by passing "mb_optimize_scan=1".
*/
- if (ctx->mb_optimize_scan == 1)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (ctx->mb_optimize_scan == 0)
- clear_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
+ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ }
err = ext4_mb_init(sb);
if (err) {
@@ -5569,6 +5618,8 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Quota mode: %s.", descr, ext4_quota_mode(sb));
+ /* Update the s_overhead_clusters if necessary */
+ ext4_update_overhead(sb);
return 0;
free_sbi:
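The ext4_max_bitmap_size() rework above sums the blocks reachable through the direct, indirect, double-indirect and triple-indirect trees, plus the metadata blocks needed to address them. A small sketch that evaluates those sums for 4 KiB blocks; the numbers are computed purely for illustration:

#include <stdio.h>

int main(void)
{
        int bits = 12;                                  /* 4 KiB blocks */
        unsigned long long ppb = 1ULL << (bits - 2);    /* pointers per block */
        unsigned long long ndir = 12;                   /* EXT4_NDIR_BLOCKS */

        /* Data blocks reachable through the block tree. */
        unsigned long long res = ndir + ppb + ppb * ppb + ppb * ppb * ppb;
        /* Metadata blocks needed to address all of them. */
        unsigned long long meta = 1 + (1 + ppb) + (1 + ppb + ppb * ppb);

        printf("pointers per block : %llu\n", ppb);
        printf("addressable blocks : %llu\n", res);
        printf("metadata blocks    : %llu\n", meta);
        printf("max size (bytes)   : %llu\n", res << bits);
        return 0;
}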
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f46a7339d6cf..03ef087537c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -143,3 +143,10 @@ config F2FS_IOSTAT
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.
+
+config F2FS_UNFAIR_RWSEM
+ bool "F2FS unfair rw_semaphore"
+ depends on F2FS_FS && BLK_CGROUP
+ help
+ Use an unfair rw_semaphore when the system has configured I/O
+ priority via the block cgroup.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 16e826e01f09..eaa240b21f07 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
return __f2fs_get_acl(inode, type, NULL);
}
-static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
- struct posix_acl **acl)
+static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
+ struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
{
umode_t mode = inode->i_mode;
int error;
@@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
}
-static int __f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
int name_index;
@@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = f2fs_acl_update_mode(inode, &mode, &acl);
+ error = f2fs_acl_update_mode(mnt_userns, inode,
+ &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
- return __f2fs_set_acl(inode, type, acl, NULL);
+ return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL);
}
/*
@@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
- error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
} else {
@@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
}
if (acl) {
if (!error)
- error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
} else {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 982f0170639f..909085a78f9c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,6 +98,13 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
+ if (page->index == sbi->metapage_eio_ofs) {
+ if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->metapage_eio_ofs = page->index;
+ sbi->metapage_eio_cnt = 0;
+ }
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -282,18 +289,22 @@ out:
return blkno - start;
}
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks)
{
struct page *page;
bool readahead = false;
+ if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
+ return;
+
page = find_get_page(META_MAPPING(sbi), index);
if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
if (readahead)
- f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
@@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
- if (!down_write_trylock(&sbi->cp_global_sem))
+ if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -436,26 +447,27 @@ stop:
return nwritten;
}
-static int f2fs_set_meta_page_dirty(struct page *page)
+static bool f2fs_dirty_meta_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, META);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- set_page_private_reference(page);
- return 1;
+ trace_f2fs_set_page_dirty(&folio->page, META);
+
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
const struct address_space_operations f2fs_meta_aops = {
.writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
- .set_page_dirty = f2fs_set_meta_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_meta_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -864,6 +876,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -871,15 +884,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1014,7 +1028,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
-void f2fs_update_dirty_page(struct inode *inode, struct page *page)
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
@@ -1029,7 +1043,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- set_page_private_reference(page);
+ set_page_private_reference(&folio->page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@@ -1159,7 +1173,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- if (!down_write_trylock(&sbi->quota_sem))
+ if (!f2fs_down_write_trylock(&sbi->quota_sem))
return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
@@ -1171,7 +1185,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
ret = true;
}
- up_write(&sbi->quota_sem);
+ f2fs_up_write(&sbi->quota_sem);
return ret;
}
@@ -1228,10 +1242,10 @@ retry_flush_dents:
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush. inode->i_blocks can be updated.
*/
- down_write(&sbi->node_change);
+ f2fs_down_write(&sbi->node_change);
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
@@ -1241,15 +1255,15 @@ retry_flush_dents:
}
retry_flush_nodes:
- down_write(&sbi->node_write);
+ f2fs_down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
atomic_inc(&sbi->wb_sync_req[NODE]);
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
return err;
}
@@ -1262,13 +1276,13 @@ retry_flush_nodes:
* dirty node blocks and some checkpoint values by block allocation.
*/
__prepare_cp_block(sbi);
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
f2fs_unlock_all(sbi);
}
@@ -1543,6 +1557,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
+ percpu_counter_set(&sbi->rf_node_block_count, 0);
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@@ -1612,7 +1627,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1693,7 +1708,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
return err;
}
@@ -1741,9 +1756,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
struct cp_control cpc = { .reason = CP_SYNC, };
int err;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return err;
}
@@ -1831,9 +1846,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
int ret;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
ret = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d0c3aeba5945..12a56f9e1572 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
"expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
+ F2FS_I_SB(dic->inode)->sb->s_id, ret,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
@@ -1267,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
} else if (!f2fs_trylock_op(sbi)) {
goto out_free;
}
@@ -1384,7 +1383,7 @@ unlock_continue:
f2fs_put_dnode(&dn);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
@@ -1410,7 +1409,7 @@ out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
out_free:
@@ -1505,9 +1504,7 @@ continue_unlock:
if (IS_NOQUOTA(cc->inode))
return 0;
ret = 0;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
return ret;
@@ -1750,7 +1747,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
const struct address_space_operations f2fs_compress_aops = {
.releasepage = f2fs_release_page,
- .invalidatepage = f2fs_invalidate_page,
+ .invalidate_folio = f2fs_invalidate_folio,
};
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8c417864c66a..9a1a526f2092 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work)
bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS);
/*
- * fsverity_verify_bio() may call readpages() again, and while verity
+ * fsverity_verify_bio() may call readahead() again, and while verity
* will be disabled for this, decryption and/or decompression may still
* be needed, resulting in another bio_post_read_ctx being allocated.
* So to prevent deadlocks we need to release the current ctx to the
@@ -354,7 +354,7 @@ static void f2fs_write_end_io(struct bio *bio)
}
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio)
+ block_t blk_addr, sector_t *sector)
{
struct block_device *bdev = sbi->sb->s_bdev;
int i;
@@ -369,10 +369,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
}
}
}
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
- }
+
+ if (sector)
+ *sector = SECTOR_FROM_BLOCK(blk_addr);
return bdev;
}
@@ -389,22 +388,55 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+static unsigned int f2fs_io_flags(struct f2fs_io_info *fio)
+{
+ unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
+ unsigned int fua_flag, meta_flag, io_flag;
+ unsigned int op_flags = 0;
+
+ if (fio->op != REQ_OP_WRITE)
+ return 0;
+ if (fio->type == DATA)
+ io_flag = fio->sbi->data_io_flag;
+ else if (fio->type == NODE)
+ io_flag = fio->sbi->node_io_flag;
+ else
+ return 0;
+
+ fua_flag = io_flag & temp_mask;
+ meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+
+ /*
+ * data/node io flag bits per temp:
+ * REQ_META | REQ_FUA |
+ * 5 | 4 | 3 | 2 | 1 | 0 |
+ * Cold | Warm | Hot | Cold | Warm | Hot |
+ */
+ if ((1 << fio->temp) & meta_flag)
+ op_flags |= REQ_META;
+ if ((1 << fio->temp) & fua_flag)
+ op_flags |= REQ_FUA;
+ return op_flags;
+}
+
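[Editor's note, not part of the patch] A small user-space sketch of how the per-temperature bitmap described in the comment inside f2fs_io_flags() decodes, assuming NR_TEMP_TYPE is 3 with HOT = 0, WARM = 1, COLD = 2 as in f2fs:

#include <stdio.h>

#define NR_TEMP_TYPE 3		/* HOT = 0, WARM = 1, COLD = 2 */

/* Decode a data/node io_flag bitmap for one temperature:
 * bits [0 .. NR_TEMP_TYPE-1]              -> REQ_FUA for that temp
 * bits [NR_TEMP_TYPE .. 2*NR_TEMP_TYPE-1] -> REQ_META for that temp */
static void show_flags(unsigned int io_flag, unsigned int temp)
{
	unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
	unsigned int fua_flag = io_flag & temp_mask;
	unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;

	printf("temp %u: FUA=%d META=%d\n", temp,
	       !!((1 << temp) & fua_flag), !!((1 << temp) & meta_flag));
}

int main(void)
{
	/* 0x31 = 0b110001: FUA for HOT, META for WARM and COLD. */
	unsigned int io_flag = 0x31;
	unsigned int temp;

	for (temp = 0; temp < NR_TEMP_TYPE; temp++)
		show_flags(io_flag, temp);
	return 0;
}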
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
+ struct block_device *bdev;
+ sector_t sector;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
-
- f2fs_target_device(sbi, fio->new_blkaddr, bio);
+ bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector);
+ bio = bio_alloc_bioset(bdev, npages,
+ fio->op | fio->op_flags | f2fs_io_flags(fio),
+ GFP_NOIO, &f2fs_bioset);
+ bio->bi_iter.bi_sector = sector;
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = NULL;
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi;
- bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
- fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
@@ -500,34 +532,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi,
__submit_bio(sbi, bio, type);
}
-static void __attach_io_flag(struct f2fs_io_info *fio)
-{
- struct f2fs_sb_info *sbi = fio->sbi;
- unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
- unsigned int io_flag, fua_flag, meta_flag;
-
- if (fio->type == DATA)
- io_flag = sbi->data_io_flag;
- else if (fio->type == NODE)
- io_flag = sbi->node_io_flag;
- else
- return;
-
- fua_flag = io_flag & temp_mask;
- meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
-
- /*
- * data/node io flag bits per temp:
- * REQ_META | REQ_FUA |
- * 5 | 4 | 3 | 2 | 1 | 0 |
- * Cold | Warm | Hot | Cold | Warm | Hot |
- */
- if ((1 << fio->temp) & meta_flag)
- fio->op_flags |= REQ_META;
- if ((1 << fio->temp) & fua_flag)
- fio->op_flags |= REQ_FUA;
-}
-
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -535,9 +539,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
- __attach_io_flag(fio);
- bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
-
if (is_read_io(fio->op))
trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
else
@@ -590,18 +591,17 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
- io->fio.op = REQ_OP_WRITE;
- io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
+ io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC;
if (!test_opt(sbi, NOBARRIER))
- io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+ io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -616,9 +616,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_read(&io->io_rwsem);
+ f2fs_down_read(&io->io_rwsem);
ret = __has_merged_page(io->bio, inode, page, ino);
- up_read(&io->io_rwsem);
+ f2fs_up_read(&io->io_rwsem);
}
if (ret)
__f2fs_submit_merged_write(sbi, type, temp);
@@ -679,9 +679,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (fio->io_wbc && !is_read_io(fio->op))
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
- __attach_io_flag(fio);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
-
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page): WB_DATA_TYPE(fio->page));
@@ -742,9 +739,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
f2fs_bug_on(sbi, 1);
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_add_tail(&be->list, &io->bio_list);
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
static void del_bio_entry(struct bio_entry *be)
@@ -766,7 +763,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct list_head *head = &io->bio_list;
struct bio_entry *be;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (be->bio != *bio)
continue;
@@ -790,7 +787,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
__submit_bio(sbi, *bio, DATA);
break;
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (ret) {
@@ -816,7 +813,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (list_empty(head))
continue;
- down_read(&io->bio_list_lock);
+ f2fs_down_read(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -826,14 +823,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (found)
break;
}
- up_read(&io->bio_list_lock);
+ f2fs_up_read(&io->bio_list_lock);
if (!found)
continue;
found = false;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -846,7 +843,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
break;
}
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (found)
@@ -875,10 +872,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
- __attach_io_flag(fio);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
fio->page->index, fio, GFP_NOIO);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
@@ -906,7 +901,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
f2fs_bug_on(sbi, is_read_io(fio->op));
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
next:
if (fio->in_list) {
spin_lock(&io->io_lock);
@@ -973,7 +968,7 @@ out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
@@ -984,17 +979,17 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
struct bio *bio;
struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
+ sector_t sector;
+ struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector);
- bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- bio_max_segs(nr_pages), &f2fs_bioset);
+ bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ | op_flag,
+ for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
-
+ bio->bi_iter.bi_sector = sector;
f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
-
- f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= STEP_DECRYPT;
@@ -1383,9 +1378,9 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
- down_read(&sbi->node_change);
+ f2fs_down_read(&sbi->node_change);
else
- up_read(&sbi->node_change);
+ f2fs_up_read(&sbi->node_change);
} else {
if (lock)
f2fs_lock_op(sbi);
@@ -2406,7 +2401,7 @@ static void f2fs_readahead(struct readahead_control *rac)
if (!f2fs_is_compress_backend_ready(inode))
return;
- /* If the file has inline data, skip readpages */
+ /* If the file has inline data, skip readahead */
if (f2fs_has_inline_data(inode))
return;
@@ -2460,6 +2455,9 @@ static inline bool check_inplace_update_policy(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
+ if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) &&
+ is_inode_flag_set(inode, FI_OPU_WRITE))
+ return false;
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
@@ -2530,6 +2528,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return true;
+ if (is_inode_flag_set(inode, FI_OPU_WRITE))
+ return true;
+
if (fio) {
if (page_private_gcing(fio->page))
return true;
@@ -2749,13 +2750,13 @@ write:
* the below discard race condition.
*/
if (IS_NOQUOTA(inode))
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto done;
}
@@ -3047,8 +3048,7 @@ result:
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
+ f2fs_io_schedule_timeout(
DEFAULT_IO_TIMEOUT);
goto retry_write;
}
@@ -3154,8 +3154,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
- /* skip writing during file defragment */
- if (is_inode_flag_set(inode, FI_DO_DEFRAG))
+ /* skip writing in file defragment preparing stage */
+ if (is_inode_flag_set(inode, FI_SKIP_WRITES))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, DATA);
@@ -3163,8 +3163,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
@@ -3213,14 +3217,14 @@ void f2fs_write_failed(struct inode *inode, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3353,7 +3357,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
- if (len == PAGE_SIZE)
+ if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
goto repeat;
ret = f2fs_prepare_compress_overwrite(inode, pagep,
@@ -3492,17 +3496,16 @@ unlock_out:
return copied;
}
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
- (offset % PAGE_SIZE || length != PAGE_SIZE))
+ (offset || length != folio_size(folio)))
return;
- if (PageDirty(page)) {
+ if (folio_test_dirty(folio)) {
if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
} else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
@@ -3513,17 +3516,16 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
if (test_opt(sbi, COMPRESS_CACHE) &&
inode->i_ino == F2FS_COMPRESS_INO(sbi))
- clear_page_private_data(page);
+ clear_page_private_data(&folio->page);
- if (page_private_atomic(page))
- return f2fs_drop_inmem_page(inode, page);
+ if (page_private_atomic(&folio->page))
+ return f2fs_drop_inmem_page(inode, &folio->page);
- detach_page_private(page);
- set_page_private(page, 0);
+ folio_detach_private(folio);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -3550,35 +3552,35 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 1;
}
-static int f2fs_set_data_page_dirty(struct page *page)
+static bool f2fs_dirty_data_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = mapping->host;
- trace_f2fs_set_page_dirty(page, DATA);
+ trace_f2fs_set_page_dirty(&folio->page, DATA);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (PageSwapCache(page))
- return __set_page_dirty_nobuffers(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ BUG_ON(folio_test_swapcache(folio));
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!page_private_atomic(page)) {
- f2fs_register_inmem_page(inode, page);
- return 1;
+ if (!page_private_atomic(&folio->page)) {
+ f2fs_register_inmem_page(inode, &folio->page);
+ return true;
}
/*
* Previously, this page has been registered, we just
* return here.
*/
- return 0;
+ return false;
}
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- f2fs_update_dirty_page(inode, page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ f2fs_update_dirty_folio(inode, folio);
+ return true;
}
- return 0;
+ return false;
}
@@ -3721,19 +3723,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int end_sec = secidx + blkcnt / blk_per_sec;
int ret = 0;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
+ set_inode_flag(inode, FI_OPU_WRITE);
for (; secidx < end_sec; secidx++) {
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
f2fs_unlock_op(sbi);
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
struct page *page;
@@ -3741,7 +3744,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
page = f2fs_get_lock_data_page(inode, blkidx, true);
if (IS_ERR(page)) {
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
ret = PTR_ERR(page);
goto done;
}
@@ -3750,22 +3753,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
f2fs_put_page(page, 1);
}
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
ret = filemap_fdatawrite(inode->i_mapping);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
if (ret)
break;
}
done:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
+ clear_inode_flag(inode, FI_OPU_WRITE);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -3938,8 +3942,8 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
- .set_page_dirty = f2fs_set_data_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_data_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
.direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
@@ -4044,6 +4048,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = blks_to_bytes(inode, map.m_lblk);
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) {
iomap->length = blks_to_bytes(inode, map.m_len);
if (map.m_flags & F2FS_MAP_MAPPED) {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8c50518475a9..fcdf253cd211 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_MUTEX(f2fs_stat_mutex);
+static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -338,14 +338,16 @@ static char *s_flag[] = {
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
+ [SBI_IS_FREEZING] = " freezefs",
};
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
@@ -474,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v)
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Low (%d)\n",
+ "Urgent High (%d), Urgent Mid (%d), "
+ "Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
@@ -532,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
+ seq_printf(s, " - fsync mark: %4lld\n",
+ percpu_counter_sum_positive(
+ &si->sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@@ -573,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -584,6 +591,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -619,9 +627,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -629,10 +637,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_del(&si->stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
kfree(si);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 166f08623362..a0e51937d92e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -766,7 +766,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -793,7 +793,7 @@ add_dentry:
f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
f2fs_put_page(dentry_page, 1);
@@ -858,7 +858,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
struct page *page;
int err = 0;
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -869,7 +869,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return err;
}
@@ -877,7 +877,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
@@ -888,7 +888,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
f2fs_i_links_write(inode, false);
f2fs_i_size_write(inode, 0);
}
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
f2fs_add_orphan_inode(inode);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 68b44015514f..8c570de21ed5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -123,6 +123,20 @@ typedef u32 nid_t;
#define COMPRESS_EXT_NUM 16
+/*
+ * An implementation of an rwsem that is explicitly unfair to readers. This
+ * prevents priority inversion when a low-priority reader acquires the read lock
+ * while sleeping on the write lock but the write lock is needed by
+ * higher-priority clients.
+ */
+
+struct f2fs_rwsem {
+ struct rw_semaphore internal_rwsem;
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_queue_head_t read_waiters;
+#endif
+};
+
struct f2fs_mount_info {
unsigned int opt;
int write_io_size_bits; /* Write IO size bits */
@@ -140,7 +154,6 @@ struct f2fs_mount_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
/* For which write hints are passed down to block layer */
- int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
@@ -386,6 +399,10 @@ struct discard_cmd_control {
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
+ unsigned int max_discard_request; /* max. discard request per round */
+ unsigned int min_discard_issue_time; /* min. interval between discard issue */
+ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
+ unsigned int max_discard_issue_time; /* max. interval between discard issue */
unsigned int discard_granularity; /* discard granularity */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
@@ -561,6 +578,9 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+/* maximum retry of EIO'ed meta page */
+#define MAX_RETRY_META_PAGE_EIO 100
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -574,6 +594,9 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -721,7 +744,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_SKIP_WRITES, /* should skip data page writeback */
+ FI_OPU_WRITE, /* used for opu per file */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */
FI_HOT_DATA, /* indicate file is hot */
@@ -752,7 +776,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
- struct rw_semaphore i_sem; /* protect fi info */
+ struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -777,8 +801,8 @@ struct f2fs_inode_info {
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
- struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+ struct f2fs_rwsem i_gc_rwsem[2];
+ struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
kprojid_t i_projid; /* id for project quota */
@@ -897,6 +921,7 @@ struct f2fs_nm_info {
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ nid_t max_rf_node_blocks; /* max # of nodes for recovery */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
@@ -904,7 +929,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat entry tree */
+ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1017,7 +1042,7 @@ struct f2fs_sm_info {
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
- struct rw_semaphore curseg_lock; /* for preventing curseg change */
+ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
@@ -1201,11 +1226,11 @@ struct f2fs_bio_info {
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
- struct rw_semaphore io_rwsem; /* blocking op for bio */
+ struct f2fs_rwsem io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
struct list_head bio_list; /* bio entry list head */
- struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
+ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1267,6 +1292,7 @@ enum {
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
SBI_IS_RESIZEFS, /* resizefs is in process */
+ SBI_IS_FREEZING, /* freezefs is in process */
};
enum {
@@ -1286,6 +1312,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
+ GC_URGENT_MID,
MAX_GC_MODE,
};
@@ -1306,12 +1333,6 @@ enum {
};
enum {
- WHINT_MODE_OFF, /* not pass down write hints */
- WHINT_MODE_USER, /* try to pass down hints given by users */
- WHINT_MODE_FS, /* pass down hints with F2FS policy */
-};
-
-enum {
ALLOC_MODE_DEFAULT, /* stay default */
ALLOC_MODE_REUSE, /* reuse segments as much as possible */
};
@@ -1571,7 +1592,7 @@ struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- struct rw_semaphore sb_lock; /* lock for raw super block */
+ struct f2fs_rwsem sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
struct mutex writepages; /* mutex for writepages() */
@@ -1591,18 +1612,20 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
- struct rw_semaphore io_order_lock;
+ struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
+ pgoff_t metapage_eio_ofs; /* EIO page offset */
+ int metapage_eio_cnt; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
- struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
- struct rw_semaphore node_write; /* locking node writes */
- struct rw_semaphore node_change; /* locking node change */
+ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */
+ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */
+ struct f2fs_rwsem node_write; /* locking node writes */
+ struct f2fs_rwsem node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -1662,12 +1685,14 @@ struct f2fs_sb_info {
block_t unusable_block_count; /* # of blocks saved by last cp */
unsigned int nquota_files; /* # of quota sysfile */
- struct rw_semaphore quota_sem; /* blocking cp for flags */
+ struct f2fs_rwsem quota_sem; /* blocking cp for flags */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* # of node block writes as roll forward recovery */
+ struct percpu_counter rf_node_block_count;
/* writeback control */
atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */
@@ -1678,7 +1703,7 @@ struct f2fs_sb_info {
struct f2fs_mount_info mount_opt; /* mount options */
/* for cleaning operations */
- struct rw_semaphore gc_lock; /*
+ struct f2fs_rwsem gc_lock; /*
* semaphore for GC, avoid
* race between GC and GC or CP
*/
@@ -1698,7 +1723,7 @@ struct f2fs_sb_info {
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
- struct rw_semaphore pin_sem;
+ struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -2092,9 +2117,81 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
+#define init_f2fs_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_f2fs_rwsem((sem), #sem, &__key); \
+} while (0)
+
+static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key)
+{
+ __init_rwsem(&sem->internal_rwsem, sem_name, key);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ init_waitqueue_head(&sem->read_waiters);
+#endif
+}
+
+static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_locked(&sem->internal_rwsem);
+}
+
+static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_contended(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_read(struct f2fs_rwsem *sem)
+{
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
+#else
+ down_read(&sem->internal_rwsem);
+#endif
+}
+
+static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
+{
+ return down_read_trylock(&sem->internal_rwsem);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_read_nested(&sem->internal_rwsem, subclass);
+}
+#else
+#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
+#endif
+
+static inline void f2fs_up_read(struct f2fs_rwsem *sem)
+{
+ up_read(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_write(struct f2fs_rwsem *sem)
+{
+ down_write(&sem->internal_rwsem);
+}
+
+static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
+{
+ return down_write_trylock(&sem->internal_rwsem);
+}
+
+static inline void f2fs_up_write(struct f2fs_rwsem *sem)
+{
+ up_write(&sem->internal_rwsem);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wake_up_all(&sem->read_waiters);
+#endif
+}
+
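[Editor's note, not part of the patch] A minimal usage sketch of the wrapper API defined above; example_writer/example_reader are hypothetical, but the pattern matches how the rest of this series converts sbi->cp_rwsem, gc_lock and the other locks:

/* Writer side: plain down_write()/up_write() on the internal rwsem;
 * with CONFIG_F2FS_UNFAIR_RWSEM, f2fs_up_write() additionally wakes
 * any readers parked on sem->read_waiters. */
static void example_writer(struct f2fs_rwsem *sem)
{
	f2fs_down_write(sem);
	/* ... exclusive section ... */
	f2fs_up_write(sem);
}

/* Reader side: with CONFIG_F2FS_UNFAIR_RWSEM, f2fs_down_read() loops on
 * down_read_trylock() from a wait queue instead of joining the rwsem's
 * wait list, which is what makes the lock explicitly unfair to readers. */
static void example_reader(struct f2fs_rwsem *sem)
{
	f2fs_down_read(sem);
	/* ... shared section ... */
	f2fs_up_read(sem);
}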
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ f2fs_down_read(&sbi->cp_rwsem);
}
static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
@@ -2103,22 +2200,22 @@ static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
f2fs_show_injection_info(sbi, FAULT_LOCK_OP);
return 0;
}
- return down_read_trylock(&sbi->cp_rwsem);
+ return f2fs_down_read_trylock(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ f2fs_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ f2fs_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ f2fs_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -2681,6 +2778,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (is_inflight_io(sbi, type))
return false;
+ if (sbi->gc_mode == GC_URGENT_MID)
+ return true;
+
if (sbi->gc_mode == GC_URGENT_LOW &&
(type == DISCARD_TIME || type == GC_TIME))
return true;
@@ -3550,8 +3650,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
@@ -3579,7 +3677,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks);
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write, enum iostat_type io_type);
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
@@ -3597,7 +3696,7 @@ void f2fs_add_orphan_inode(struct inode *inode);
void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
-void f2fs_update_dirty_page(struct inode *inode, struct page *page);
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
@@ -3631,7 +3730,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio);
+ block_t blk_addr, sector_t *sector);
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
@@ -3661,8 +3760,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
enum iostat_type io_type,
int compr_blocks, bool allow_balance);
void f2fs_write_failed(struct inode *inode, loff_t to);
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length);
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
int f2fs_release_page(struct page *page, gfp_t wait);
#ifdef CONFIG_MIGRATION
int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
@@ -4371,7 +4469,11 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- if (f2fs_post_read_required(inode))
+ if (!fscrypt_dio_supported(iocb, iter))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
return true;
/* disallow direct IO if any of devices has unaligned blksize */
@@ -4426,6 +4528,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}
+static inline void f2fs_io_schedule_timeout(long timeout)
+{
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(timeout);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c98ef6af97d..5b89af0f27f0 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -237,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t pino;
- down_write(&fi->i_sem);
+ f2fs_down_write(&fi->i_sem);
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
}
- up_write(&fi->i_sem);
+ f2fs_up_write(&fi->i_sem);
}
static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
@@ -318,9 +318,9 @@ go_write:
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&F2FS_I(inode)->i_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
cp_reason = need_do_checkpoint(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
if (cp_reason) {
/* all the dirty node pages should be flushed for POR */
@@ -812,7 +812,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri;
+ struct f2fs_inode *ri = NULL;
unsigned int flags;
if (f2fs_has_extra_attr(inode) &&
@@ -844,7 +844,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -904,7 +904,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -958,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -970,7 +970,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* larger than i_size.
*/
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -980,10 +980,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
- __setattr_copy(&init_user_ns, inode, attr);
+ __setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1112,7 +1112,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache_range(inode, blk_start, blk_end - 1);
@@ -1122,7 +1122,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1355,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
@@ -1365,7 +1365,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1500,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
unsigned int end_offset;
pgoff_t end;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
@@ -1514,7 +1514,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret) {
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1526,7 +1526,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1600,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
@@ -1618,7 +1618,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
}
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
@@ -1674,13 +1674,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
@@ -1690,7 +1690,7 @@ next_alloc:
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
file_dont_truncate(inode);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
expanded += map.m_len;
sec_len -= map.m_len;
@@ -1989,11 +1989,12 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2008,7 +2009,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode_lock(inode);
- f2fs_disable_compressed_file(inode);
+ if (!f2fs_disable_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
@@ -2020,7 +2024,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (ret)
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
@@ -2031,7 +2035,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -2044,7 +2048,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
/* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
F2FS_I(inode)->inmem_task = current;
@@ -2058,9 +2062,10 @@ out:
static int f2fs_ioc_commit_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2100,9 +2105,10 @@ err_out:
static int f2fs_ioc_start_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2135,9 +2141,10 @@ out:
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2164,9 +2171,10 @@ out:
static int f2fs_ioc_abort_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2351,7 +2359,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err)
return err;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
goto got_it;
@@ -2370,7 +2378,7 @@ got_it:
16))
err = -EFAULT;
out_err:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
return err;
}
@@ -2447,12 +2455,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
@@ -2483,12 +2491,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
do_more:
if (!range->sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, range->sync, true, false,
@@ -2559,10 +2567,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- /* if in-place-update policy is enabled, don't waste time here */
- if (f2fs_should_update_inplace(inode, NULL))
- return -EINVAL;
-
pg_start = range->start >> PAGE_SHIFT;
pg_end = (range->start + range->len) >> PAGE_SHIFT;
@@ -2570,6 +2574,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
inode_lock(inode);
+ /* if in-place-update policy is enabled, don't waste time here */
+ set_inode_flag(inode, FI_OPU_WRITE);
+ if (f2fs_should_update_inplace(inode, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range(inode->i_mapping, range->start,
range->start + range->len - 1);
@@ -2651,7 +2662,7 @@ do_map:
goto check;
}
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -2676,15 +2687,16 @@ check:
if (map.m_lblk < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
out:
+ clear_inode_flag(inode, FI_OPU_WRITE);
inode_unlock(inode);
if (!err)
range->len = (u64)total << PAGE_SHIFT;
@@ -2820,10 +2832,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
if (src != dst) {
ret = -EBUSY;
- if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+ if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
goto out_src;
}
@@ -2841,9 +2853,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_unlock_op(sbi);
if (src != dst)
- up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
out_src:
- up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
out_unlock:
if (src != dst)
inode_unlock(dst);
@@ -2938,7 +2950,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
@@ -2990,7 +3002,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *ipage;
+ struct f2fs_inode *ri = NULL;
kprojid_t kprojid;
int err;
@@ -3014,17 +3026,8 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (IS_NOQUOTA(inode))
return err;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
-
- if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
- i_projid)) {
- err = -EOVERFLOW;
- f2fs_put_page(ipage, 1);
- return err;
- }
- f2fs_put_page(ipage, 1);
+ if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ return -EOVERFLOW;
err = f2fs_dquot_initialize(inode);
if (err)
@@ -3215,9 +3218,9 @@ int f2fs_precache_extents(struct inode *inode)
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -3294,11 +3297,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
if (!vbuf)
return -ENOMEM;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
count = utf16s_to_utf8s(sbi->raw_super->volume_name,
ARRAY_SIZE(sbi->raw_super->volume_name),
UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME);
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (copy_to_user((char __user *)arg, vbuf,
min(FSLABEL_MAX, count)))
@@ -3326,7 +3329,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
if (err)
goto out;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
memset(sbi->raw_super->volume_name, 0,
sizeof(sbi->raw_super->volume_name));
@@ -3336,7 +3339,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
err = f2fs_commit_super(sbi, false);
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
out:
@@ -3462,7 +3465,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3499,7 +3502,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
inode_unlock(inode);
@@ -3615,7 +3618,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3652,7 +3655,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3770,7 +3773,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (ret)
goto err;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
@@ -3859,7 +3862,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
prev_block, len, range.flags);
out:
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
file_end_write(filp);
@@ -4291,12 +4294,12 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
trace_f2fs_direct_IO_enter(inode, iocb, count, READ);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
ret = -EAGAIN;
goto out;
}
} else {
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
/*
@@ -4315,7 +4318,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = iomap_dio_complete(dio);
}
- up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
file_accessed(file);
out:
@@ -4445,7 +4448,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
return -EOPNOTSUPP;
current->backing_dev_info = inode_to_bdi(inode);
- ret = generic_perform_write(file, from, iocb->ki_pos);
+ ret = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
if (ret > 0) {
@@ -4479,10 +4482,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
const bool do_opu = f2fs_lfs_mode(sbi);
- const int whint_mode = F2FS_OPTION(sbi).whint_mode;
const loff_t pos = iocb->ki_pos;
const ssize_t count = iov_iter_count(from);
- const enum rw_hint hint = iocb->ki_hint;
unsigned int dio_flags;
struct iomap_dio *dio;
ssize_t ret;
@@ -4497,12 +4498,12 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
goto out;
}
- if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
ret = -EAGAIN;
goto out;
}
- if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
- up_read(&fi->i_gc_rwsem[WRITE]);
+ if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
ret = -EAGAIN;
goto out;
}
@@ -4511,12 +4512,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (ret)
goto out;
- down_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
if (do_opu)
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
- if (whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = WRITE_LIFE_NOT_SET;
/*
* We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
@@ -4539,11 +4538,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
ret = iomap_dio_complete(dio);
}
- if (whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = hint;
if (do_opu)
- up_read(&fi->i_gc_rwsem[READ]);
- up_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
if (ret < 0)
goto out;
@@ -4644,12 +4641,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
if (!f2fs_truncate(inode))
file_dont_truncate(inode);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
} else {
file_dont_truncate(inode);
}
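The down_read/down_write conversions throughout this patch route every f2fs lock through wrapper helpers (f2fs_down_read(), f2fs_down_write_trylock(), f2fs_rwsem_is_locked() and friends) that operate on a wrapper type, presumably a struct f2fs_rwsem added to fs/f2fs/f2fs.h, whose definition is not repeated in this portion of the diff. As rough orientation only, a minimal sketch of such a wrapper, assuming it simply delegates to the kernel rw_semaphore (the in-tree helpers may add extra fairness handling), could look like:

	/*
	 * Sketch (assumption, not the actual f2fs.h definition): a thin shim
	 * around the kernel rw_semaphore so that lock behaviour can later be
	 * tuned in one place without touching every call site again.
	 */
	#include <linux/rwsem.h>

	struct f2fs_rwsem {
		struct rw_semaphore internal_rwsem;
	};

	#define init_f2fs_rwsem(sem)	init_rwsem(&(sem)->internal_rwsem)

	static inline void f2fs_down_read(struct f2fs_rwsem *sem)
	{
		down_read(&sem->internal_rwsem);
	}

	static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
	{
		return down_read_trylock(&sem->internal_rwsem);
	}

	static inline void f2fs_up_read(struct f2fs_rwsem *sem)
	{
		up_read(&sem->internal_rwsem);
	}

	static inline void f2fs_down_write(struct f2fs_rwsem *sem)
	{
		down_write(&sem->internal_rwsem);
	}

	static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
	{
		return down_write_trylock(&sem->internal_rwsem);
	}

	static inline void f2fs_up_write(struct f2fs_rwsem *sem)
	{
		up_write(&sem->internal_rwsem);
	}

	static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
	{
		return rwsem_is_locked(&sem->internal_rwsem);
	}

	static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)
	{
		return rwsem_is_contended(&sem->internal_rwsem);
	}

Funnelling every caller through one shim is what makes the mechanical down_*/up_* replacements below worthwhile: lock fairness or instrumentation changes can later land in a single header.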
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ee308a8de432..ea5b93b689cd 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -103,23 +103,26 @@ static int gc_thread_func(void *data)
sbi->gc_urgent_high_remaining--;
}
spin_unlock(&sbi->gc_urgent_high_lock);
+ }
+ if (sbi->gc_mode == GC_URGENT_HIGH ||
+ sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
}
if (foreground) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
- } else if (!down_write_trylock(&sbi->gc_lock)) {
+ } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -1038,8 +1041,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- if (f2fs_check_nid_range(sbi, dni->ino))
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
return false;
+ }
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
@@ -1230,7 +1235,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
- down_write(&fio.sbi->io_order_lock);
+ f2fs_down_write(&fio.sbi->io_order_lock);
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
@@ -1316,7 +1321,7 @@ recover_block:
true, true, true);
up_out:
if (lfs_mode)
- up_write(&fio.sbi->io_order_lock);
+ f2fs_up_write(&fio.sbi->io_order_lock);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -1475,7 +1480,7 @@ next_step:
special_file(inode->i_mode))
continue;
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
sbi->skipped_gc_rwsem++;
@@ -1488,7 +1493,7 @@ next_step:
if (f2fs_post_read_required(inode)) {
int err = ra_data_block(inode, start_bidx);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err) {
iput(inode);
continue;
@@ -1499,7 +1504,7 @@ next_step:
data_page = f2fs_get_read_data_page(inode,
start_bidx, REQ_RAHEAD, true);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -1518,14 +1523,14 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) {
sbi->skipped_gc_rwsem++;
continue;
}
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
@@ -1548,8 +1553,8 @@ next_step:
submitted++;
if (locked) {
- up_write(&fi->i_gc_rwsem[WRITE]);
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
@@ -1807,7 +1812,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
@@ -1936,7 +1941,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
long long block_count;
int segs = secs * sbi->segs_per_sec;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
section_count = le32_to_cpu(raw_sb->section_count);
segment_count = le32_to_cpu(raw_sb->segment_count);
@@ -1957,7 +1962,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
cpu_to_le32(dev_segs + segs);
}
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -2031,7 +2036,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
/* stop other GC */
- if (!down_write_trylock(&sbi->gc_lock))
+ if (!f2fs_down_write_trylock(&sbi->gc_lock))
return -EAGAIN;
/* stop CP to protect MAIN_SEC in free_segment_range */
@@ -2051,15 +2056,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
out_unlock:
f2fs_unlock_op(sbi);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
return err;
set_sbi_flag(sbi, SBI_IS_RESIZEFS);
freeze_super(sbi->sb);
- down_write(&sbi->gc_lock);
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->cp_global_sem);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2104,8 +2109,8 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- up_write(&sbi->cp_global_sem);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 4b5cefa3f90c..a578bf83b803 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -629,7 +629,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -658,7 +658,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
f2fs_put_page(ipage, 1);
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0ec8e32a00b4..83639238a1fe 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -550,7 +550,8 @@ make_now:
}
f2fs_set_inode_flags(inode);
- if (file_should_truncate(inode)) {
+ if (file_should_truncate(inode) &&
+ !is_sbi_flag_set(sbi, SBI_POR_DOING)) {
ret = f2fs_truncate(inode);
if (ret)
goto bad_inode;
@@ -778,7 +779,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- sb_start_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -809,7 +811,8 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- sb_end_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
@@ -885,6 +888,7 @@ void f2fs_handle_failed_inode(struct inode *inode)
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 5f213f05556d..5ed79b29999f 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,7 +22,8 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
@@ -46,7 +47,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
@@ -67,7 +68,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
else
- F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
F2FS_DEF_PROJID);
err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
@@ -196,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
int i, cold_count, hot_count;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
@@ -206,7 +207,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
break;
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (i == cold_count + hot_count)
return;
@@ -299,19 +300,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
(!ext_cnt && !noext_cnt))
return;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i], false)) {
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
return;
}
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
for (i = 0; i < noext_cnt; i++) {
if (is_extension_exist(name, noext[i], false)) {
@@ -349,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -679,7 +680,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -750,7 +751,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -807,7 +808,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -834,8 +835,9 @@ out:
return err;
}
-static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+ struct inode **whiteout)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -845,7 +847,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -909,20 +911,22 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
}
-static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
+ struct inode *dir, struct inode **whiteout)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
return -EIO;
- return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+ return __f2fs_tmpfile(mnt_userns, dir, NULL,
+ S_IFCHR | WHITEOUT_MODE, whiteout);
}
-static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = d_inode(old_dentry);
@@ -960,7 +964,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
+ err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout);
if (err)
return err;
}
@@ -1023,11 +1027,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
f2fs_add_orphan_inode(new_inode);
@@ -1048,13 +1052,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, true);
}
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
@@ -1107,8 +1111,7 @@ out_dir:
out_old:
f2fs_put_page(old_page, 0);
out:
- if (whiteout)
- iput(whiteout);
+ iput(whiteout);
return err;
}
@@ -1214,38 +1217,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update directory entry info of old dir inode */
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
- down_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
- up_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(old_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
file_lost_pino(new_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(new_inode, old_dir->i_ino);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
- down_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
- up_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(new_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(new_dir, false);
@@ -1300,7 +1303,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return f2fs_rename(mnt_userns, old_dir, old_dentry,
+ new_dir, new_dentry, flags);
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 50b2874e758c..c45d341dcf6e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -431,14 +431,14 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct nat_entry *new, *e;
/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
- if (rwsem_is_locked(&sbi->cp_global_sem))
+ if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
return;
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e)
e = __init_nat_entry(nm_i, new, ne, false);
@@ -447,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
nat_get_blkaddr(e) !=
le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
if (e != new)
__free_nat_entry(new);
}
@@ -459,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = __init_nat_entry(nm_i, new, NULL, true);
@@ -508,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -516,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
+ if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
return 0;
spin_lock(&nm_i->nat_list_lock);
@@ -538,7 +538,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&nm_i->nat_list_lock);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -560,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
ni->nid = nid;
retry:
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return 0;
}
@@ -576,11 +576,11 @@ retry:
* nat_tree_lock. Therefore, we should retry, if we failed to grab here
* while not bothering checkpoint.
*/
- if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
+ if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
down_read(&curseg->journal_rwsem);
- } else if (rwsem_is_contended(&nm_i->nat_tree_lock) ||
+ } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
!down_read_trylock(&curseg->journal_rwsem)) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto retry;
}
@@ -589,15 +589,15 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
page = f2fs_get_meta_page(sbi, index);
if (IS_ERR(page))
@@ -1609,17 +1609,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
+ if (!f2fs_down_read_trylock(&sbi->node_write))
goto redirty_out;
} else {
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
}
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
@@ -1627,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto redirty_out;
}
@@ -1648,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
@@ -1782,6 +1782,7 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
+ percpu_counter_inc(&sbi->rf_node_block_count);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
@@ -2111,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -2132,23 +2137,24 @@ skip_write:
return 0;
}
-static int f2fs_set_node_page_dirty(struct page *page)
+static bool f2fs_dirty_node_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, NODE);
+ trace_f2fs_set_page_dirty(&folio->page, NODE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
#ifdef CONFIG_F2FS_CHECK_FS
- if (IS_INODE(page))
- f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+ if (IS_INODE(&folio->page))
+ f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
#endif
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- set_page_private_reference(page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -2157,8 +2163,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
const struct address_space_operations f2fs_node_aops = {
.writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
- .set_page_dirty = f2fs_set_node_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_node_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -2225,14 +2231,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
unsigned int i;
bool ret = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return ret;
}
@@ -2415,7 +2421,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
unsigned int i, idx;
nid_t nid;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap))
@@ -2438,7 +2444,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
out:
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
@@ -2473,7 +2479,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
@@ -2488,7 +2494,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
}
if (ret) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2508,7 +2514,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
/* find free nids from current sum_pages */
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -2953,7 +2959,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
@@ -2973,7 +2979,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
__update_nat_bits(nm_i, nat_ofs, valid);
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3071,15 +3077,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* nat_cnt[DIRTY_NAT].
*/
if (cpc->reason & CP_UMOUNT) {
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
/*
* if there is not enough space in journal to store dirty nat
@@ -3108,7 +3114,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
return err;
@@ -3218,6 +3224,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+ nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
@@ -3228,7 +3235,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ init_f2fs_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -3334,7 +3341,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -3364,7 +3371,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
if (nm_i->free_nid_bitmap) {
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 18b98cf0465b..4c1d34bfea78 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,6 +31,9 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
+/* control total # of node writes used for roll-forward recovery */
+#define DEF_RF_NODE_BLOCKS 0
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 79773d322c47..3cb7f8a43b4d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -56,6 +56,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
+ if (NM_I(sbi)->max_rf_node_blocks &&
+ percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
+ NM_I(sbi)->max_rf_node_blocks)
+ return false;
return true;
}
@@ -343,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page)
return 0;
}
+static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
+ unsigned int ra_blocks, unsigned int blkaddr,
+ unsigned int next_blkaddr)
+{
+ if (blkaddr + 1 == next_blkaddr)
+ ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
+ ra_blocks * 2);
+ else if (next_blkaddr % sbi->blocks_per_seg)
+ ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
+ ra_blocks / 2);
+ return ra_blocks;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
@@ -350,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct page *page = NULL;
block_t blkaddr;
unsigned int loop_cnt = 0;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
valid_user_blocks(sbi);
int err = 0;
@@ -424,11 +442,14 @@ next:
break;
}
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
return err;
}
@@ -704,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct page *page = NULL;
int err = 0;
block_t blkaddr;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -715,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
-
page = f2fs_get_tmp_page(sbi, blkaddr);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -759,9 +779,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
f2fs_allocate_new_segments(sbi);
@@ -796,7 +821,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@@ -845,7 +870,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1dabc8244083..bd9731cdec56 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -313,8 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
- cond_resched();
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
if (gc_failure) {
if (++looped >= count)
return;
@@ -471,7 +470,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
f2fs_balance_fs(sbi, true);
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
f2fs_lock_op(sbi);
set_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -483,7 +482,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
clear_inode_flag(inode, FI_ATOMIC_COMMIT);
f2fs_unlock_op(sbi);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
return err;
}
@@ -521,7 +520,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_gc(sbi, false, false, false, NULL_SEGNO);
}
}
@@ -529,7 +528,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
{
- int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
+ int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -570,7 +569,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
/* there is background inflight IO or foreground operation recently */
if (is_inflight_io(sbi, REQ_TIME) ||
- (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
+ (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem)))
return;
/* exceed periodical checkpoint timeout threshold */
@@ -803,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (ret && --count);
if (ret) {
@@ -1156,14 +1154,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->ordered = false;
dpolicy->granularity = granularity;
- dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+ dpolicy->max_requests = dcc->max_discard_request;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
@@ -1171,12 +1169,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->granularity = 1;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
- DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->min_discard_issue_time;
}
} else if (discard_type == DPOLICY_FORCE) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
@@ -1781,7 +1779,7 @@ static int issue_discard_thread(void *data)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
wait_queue_head_t *q = &dcc->discard_wait_queue;
struct discard_policy dpolicy;
- unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ unsigned int wait_ms = dcc->min_discard_issue_time;
int issued;
set_freezable();
@@ -2180,6 +2178,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
+ dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
+ dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2821,7 +2823,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
if (!sbi->am.atgc_enabled)
return;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -2831,7 +2833,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
@@ -2982,7 +2984,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -3006,7 +3008,7 @@ unlock:
type, segno, curseg->segno);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
@@ -3038,23 +3040,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi,
void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
__allocate_new_section(sbi, type, force);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static const struct segment_allocation default_salloc_ops = {
@@ -3133,7 +3135,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -3192,9 +3194,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
goto out;
@@ -3241,101 +3243,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
-/* This returns write hints for each segment type. This hints will be
- * passed down to block layer. There are mapping tables which depend on
- * the mount option 'whint_mode'.
- *
- * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
- *
- * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_NOT_SET
- * HOT_NODE "
- * WARM_NODE "
- * COLD_NODE "
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- *
- * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_MEDIUM;
- * HOT_NODE WRITE_LIFE_NOT_SET
- * WARM_NODE "
- * COLD_NODE WRITE_LIFE_NONE
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- */
-
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp)
-{
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_NOT_SET;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else {
- return WRITE_LIFE_NOT_SET;
- }
- } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_LONG;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else if (type == NODE) {
- if (temp == WARM || temp == HOT)
- return WRITE_LIFE_NOT_SET;
- else if (temp == COLD)
- return WRITE_LIFE_NONE;
- } else if (type == META) {
- return WRITE_LIFE_MEDIUM;
- }
- }
- return WRITE_LIFE_NOT_SET;
-}
-
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -3431,7 +3338,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
@@ -3514,7 +3421,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3550,7 +3457,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
- down_read(&fio->sbi->io_order_lock);
+ f2fs_down_read(&fio->sbi->io_order_lock);
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
@@ -3570,7 +3477,7 @@ reallocate:
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
- up_read(&fio->sbi->io_order_lock);
+ f2fs_up_read(&fio->sbi->io_order_lock);
}
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -3705,7 +3612,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
se = get_seg_entry(sbi, segno);
type = se->type;
- down_write(&SM_I(sbi)->curseg_lock);
+ f2fs_down_write(&SM_I(sbi)->curseg_lock);
if (!recover_curseg) {
/* for recovery flow */
@@ -3774,7 +3681,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_write(&SM_I(sbi)->curseg_lock);
+ f2fs_up_write(&SM_I(sbi)->curseg_lock);
}
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -4789,6 +4696,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
sanity_check_seg_type(sbi, curseg->seg_type);
+ if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
+ f2fs_err(sbi,
+ "Current segment has invalid alloc_type:%d",
+ curseg->alloc_type);
+ return -EFSCORRUPTED;
+ }
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
@@ -5258,7 +5172,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sm_info->sit_entry_set);
- init_rwsem(&sm_info->curseg_lock);
+ init_f2fs_rwsem(&sm_info->curseg_lock);
if (!f2fs_readonly(sbi->sb)) {
err = f2fs_create_flush_cmd_control(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 0291cd55cf09..5c94caf0c0a1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -651,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* pages over min_fsync_blocks. (=default option)
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
* F2FS_IPU_NOCACHE - disable IPU bio cache.
- * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
+ * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has
+ * FI_OPU_WRITE flag.
+ * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
@@ -667,6 +669,7 @@ enum {
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
+ F2FS_IPU_HONOR_OPU_WRITE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index baefd398ec1a..4368f90571bd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -138,7 +138,6 @@ enum {
Opt_jqfmt_vfsold,
Opt_jqfmt_vfsv0,
Opt_jqfmt_vfsv1,
- Opt_whint,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -214,7 +213,6 @@ static match_table_t f2fs_tokens = {
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_whint, "whint_mode=%s"},
{Opt_alloc, "alloc_mode=%s"},
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
@@ -975,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "quota operations not supported");
break;
#endif
- case Opt_whint:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "user-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
- } else if (!strcmp(name, "off")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
- } else if (!strcmp(name, "fs-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
case Opt_alloc:
name = match_strdup(&args[0]);
if (!name)
@@ -1328,12 +1310,6 @@ default_check:
return -EINVAL;
}
- /* Not pass down write hints if the number of active logs is lesser
- * than NR_CURSEG_PERSIST_TYPE.
- */
- if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE)
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
-
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1345,8 +1321,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
- GFP_F2FS_ZERO, false, F2FS_SB(sb));
+ if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) {
+ f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC);
+ return NULL;
+ }
+
+ fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
return NULL;
@@ -1355,16 +1335,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
- init_rwsem(&fi->i_sem);
+ init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
- init_rwsem(&fi->i_gc_rwsem[READ]);
- init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_xattr_sem);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
+ init_f2fs_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -1501,8 +1481,9 @@ static void f2fs_free_inode(struct inode *inode)
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
}
static void destroy_device_list(struct f2fs_sb_info *sbi)
@@ -1662,11 +1643,15 @@ static int f2fs_freeze(struct super_block *sb)
/* ensure no checkpoint required */
if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
return -EINVAL;
+
+ /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
{
+ clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
@@ -1969,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",prjquota");
#endif
f2fs_show_quota_options(seq, sbi->sb);
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
- seq_printf(seq, ",whint_mode=%s", "user-based");
- else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
- seq_printf(seq, ",whint_mode=%s", "fs-based");
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
@@ -2024,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
@@ -2075,6 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
+ unsigned int gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2087,8 +2068,11 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_update_time(sbi, DISABLE_TIME);
+ gc_mode = sbi->gc_mode;
+ sbi->gc_mode = GC_URGENT_HIGH;
+
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
@@ -2110,7 +2094,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
err = f2fs_write_checkpoint(sbi, &cpc);
@@ -2122,8 +2106,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->stat_lock);
out_unlock:
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
restore_flag:
+ sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -2135,19 +2120,18 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
if (unlikely(retry < 0))
f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
}
@@ -2301,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY ||
- F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
+ if (*flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2504,8 +2487,7 @@ retry:
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -2688,7 +2670,7 @@ int f2fs_quota_sync(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
- int ret;
+ int ret = 0;
/*
* Now when everything is written we can discard the pagecache so
@@ -2699,26 +2681,26 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, type))
- return 0;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
* f2fs_quota_sync
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
* dquot_writeback_dquots()
* f2fs_dquot_commit
* block_operation
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
*/
f2fs_lock_op(sbi);
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = f2fs_quota_sync_file(sbi, cnt);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
inode_unlock(dqopt->files[cnt]);
@@ -2843,11 +2825,11 @@ static int f2fs_dquot_commit(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
+ f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
ret = dquot_commit(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -2856,11 +2838,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = dquot_acquire(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -3574,6 +3556,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->gc_mode = GC_NORMAL;
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
@@ -3601,14 +3584,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- init_rwsem(&sbi->io_order_lock);
+ init_f2fs_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
sbi->dirty_device = 0;
spin_lock_init(&sbi->dev_lock);
- init_rwsem(&sbi->sb_lock);
- init_rwsem(&sbi->pin_sem);
+ init_f2fs_rwsem(&sbi->sb_lock);
+ init_f2fs_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -3619,11 +3602,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
+ err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL);
+ if (err)
+ goto err_valid_block;
+
err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
if (err)
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ goto err_node_block;
+ return 0;
+err_node_block:
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+err_valid_block:
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
return err;
}
@@ -3957,7 +3949,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
if (f2fs_block_unit_discard(sbi))
sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
+ sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ 1 << F2FS_IPU_HONOR_OPU_WRITE;
}
sbi->readdir_ra = 1;
@@ -4067,11 +4060,11 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_rwsem(&sbi->gc_lock);
+ init_f2fs_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
- init_rwsem(&sbi->cp_global_sem);
- init_rwsem(&sbi->node_write);
- init_rwsem(&sbi->node_change);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -4092,18 +4085,18 @@ try_onemore:
}
for (j = HOT; j < n; j++) {
- init_rwsem(&sbi->write_io[i][j].io_rwsem);
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
sbi->write_io[i][j].sbi = sbi;
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
- init_rwsem(&sbi->cp_rwsem);
- init_rwsem(&sbi->quota_sem);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
@@ -4528,7 +4521,7 @@ static struct file_system_type f2fs_fs_type = {
.name = "f2fs",
.mount = f2fs_mount,
.kill_sb = kill_f2fs_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("f2fs");
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 8ac506671245..4c50aedd5144 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -41,6 +41,16 @@ enum {
ATGC_INFO, /* struct atgc_management */
};
+static const char *gc_mode_names[MAX_GC_MODE] = {
+ "GC_NORMAL",
+ "GC_IDLE_CB",
+ "GC_IDLE_GREEDY",
+ "GC_IDLE_AT",
+ "GC_URGENT_HIGH",
+ "GC_URGENT_LOW",
+ "GC_URGENT_MID"
+};
+
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
@@ -316,8 +326,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
+ if (!strcmp(a->attr.name, "gc_urgent"))
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_mode]);
+
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_segment_mode]);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -363,7 +378,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
@@ -373,7 +388,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (ret)
f2fs_update_extension_list(sbi, name, hot, !set);
out:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
@@ -468,6 +483,13 @@ out:
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
+ } else if (t == 3) {
+ sbi->gc_mode = GC_URGENT_MID;
+ if (sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(
+ &sbi->gc_thread->gc_wait_queue_head);
+ }
} else {
return -EINVAL;
}
@@ -481,7 +503,7 @@ out:
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
- sbi->gc_mode = GC_AT;
+ sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
@@ -716,6 +738,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
@@ -728,6 +754,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -832,6 +859,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(max_discard_request),
+ ATTR_LIST(min_discard_issue_time),
+ ATTR_LIST(mid_discard_issue_time),
+ ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_granularity),
ATTR_LIST(pending_discard),
ATTR_LIST(batched_trim_sections),
@@ -847,6 +878,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index fe5acdccaae1..3d793202cc9f 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -208,7 +208,7 @@ cleanup:
* from re-instantiating cached pages we are truncating (since unlike
* normal file accesses, garbage collection isn't limited by i_size).
*/
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
truncate_inode_pages(inode->i_mapping, inode->i_size);
err2 = f2fs_truncate(inode);
if (err2) {
@@ -216,7 +216,7 @@ cleanup:
err2);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return err ?: err2;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8e5cd9c916ff..c76c15086e5f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -525,10 +525,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -562,9 +562,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error;
size_t rest = buffer_size;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -786,9 +786,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- down_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
- up_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c4a274285858..249825017da7 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
if (name_len >= sizeof(d1->d_name)) \
name_len = sizeof(d1->d_name) - 1; \
\
- if (put_user(0, d2->d_name) || \
+ if (put_user(0, &d2->d_name[0]) || \
put_user(0, &d2->d_reclen) || \
copy_to_user(d1->d_name, name, name_len) || \
put_user(0, d1->d_name + name_len) || \
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a6f1c6d426d1..bf6051bdf1d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -342,7 +342,8 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations fat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
@@ -745,7 +746,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..f15d885b9796 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
u64 h;
switch (cmd) {
- case F_GET_FILE_RW_HINT:
- h = file_write_hint(file);
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_FILE_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
-
- spin_lock(&file->f_lock);
- file->f_write_hint = hint;
- spin_unlock(&file->f_lock);
- return 0;
case F_GET_RW_HINT:
h = inode->i_write_hint;
if (copy_to_user(argp, &h, sizeof(*argp)))
@@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
- case F_GET_FILE_RW_HINT:
- case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
default:
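
With the per-file F_GET_FILE_RW_HINT/F_SET_FILE_RW_HINT commands removed above, userspace is left with the per-inode F_GET_RW_HINT/F_SET_RW_HINT pair. Below is a minimal sketch of how that remaining interface is driven from userspace; the RWH_WRITE_LIFE_SHORT value and the numeric fallbacks mirror the uapi definitions and are only compiled in when an older libc does not expose them.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

/* Fallbacks for older userspace headers; values match the uapi definitions. */
#ifndef F_GET_RW_HINT
#define F_GET_RW_HINT	1035	/* F_LINUX_SPECIFIC_BASE + 11 */
#define F_SET_RW_HINT	1036	/* F_LINUX_SPECIFIC_BASE + 12 */
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(int argc, char **argv)
{
	uint64_t hint;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;

	/* Read the current per-inode write hint... */
	if (fcntl(fd, F_GET_RW_HINT, &hint) == 0)
		printf("current write hint: %llu\n", (unsigned long long)hint);

	/* ...then mark the inode's data as short-lived. */
	hint = RWH_WRITE_LIFE_SHORT;
	if (fcntl(fd, F_SET_RW_HINT, &hint) != 0)
		perror("F_SET_RW_HINT");

	close(fd);
	return 0;
}
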
diff --git a/fs/file.c b/fs/file.c
index 97d212a9b814..ee9317346702 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
@@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
+ nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
@@ -269,6 +285,19 @@ static unsigned int count_open_files(struct fdtable *fdt)
return i;
}
+/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Because that's really what it is.
+ */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
unsigned int count;
@@ -276,7 +305,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
count = count_open_files(fdt);
if (max_fds < NR_OPEN_DEFAULT)
max_fds = NR_OPEN_DEFAULT;
- return min(count, max_fds);
+ return ALIGN(min(count, max_fds), BITS_PER_LONG);
}
/*
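
To make the rounding above concrete, here is a small standalone sketch of the same round-up-to-BITS_PER_LONG arithmetic; BITS_PER_LONG and ALIGN() are re-derived locally because this is not kernel code, and the sample counts are purely illustrative.

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(long))
/* Same open-coded rounding the kernel's ALIGN() performs for a power-of-two 'a'. */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long counts[] = { 1, 63, 64, 65, 100, 250 };

	/* Every result is a multiple of BITS_PER_LONG, as the fdtable code expects. */
	for (size_t i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
		printf("%3lu -> %3lu\n", counts[i],
		       (unsigned long)ALIGN(counts[i], BITS_PER_LONG));
	return 0;
}
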
diff --git a/fs/file_table.c b/fs/file_table.c
index 7d2e692b66a9..ada8fe814db9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -412,6 +412,7 @@ void __fput_sync(struct file *file)
}
EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 578a5062706e..22eed5a73ac2 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb)
{
struct vxfs_inode_info *vi;
- vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
+ vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
inode_init_once(&vi->vfs_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8d7fe6db989..591fe9cf1659 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -894,43 +894,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
/**
- * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion (may be NULL)
- * @cong_bits: mask of WB_[a]sync_congested bits to test
- *
- * Tests whether @inode is congested. @cong_bits is the mask of congestion
- * bits to test and the return value is the mask of set bits.
- *
- * If cgroup writeback is enabled for @inode, the congestion state is
- * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- * associated with @inode is congested; otherwise, the root wb's congestion
- * state is used.
- *
- * @inode is allowed to be NULL as this function is often called on
- * mapping->host which is NULL for the swapper space.
- */
-int inode_congested(struct inode *inode, int cong_bits)
-{
- /*
- * Once set, ->i_wb never becomes NULL while the inode is alive.
- * Start transaction iff ->i_wb is visible.
- */
- if (inode && inode_to_wb_is_valid(inode)) {
- struct bdi_writeback *wb;
- struct wb_lock_cookie lock_cookie = {};
- bool congested;
-
- wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
- congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, &lock_cookie);
- return congested;
- }
-
- return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-}
-EXPORT_SYMBOL_GPL(inode_congested);
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -1903,8 +1866,7 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2234,7 +2196,6 @@ void wb_workfn(struct work_struct *work)
long pages_written;
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
- current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {
@@ -2263,8 +2224,6 @@ void wb_workfn(struct work_struct *work)
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
-
- current->flags &= ~PF_SWAPWRITE;
}
/*
@@ -2301,8 +2260,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
/*
* If we are expecting writeback progress we must submit plugged IO.
*/
- if (blk_needs_flush_plug(current))
- blk_flush_plug(current->plug, true);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 76316c4a3fb7..b313a978ae0a 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -38,6 +38,3 @@ config FSCACHE_DEBUG
enabled by setting bits in /sys/modules/fscache/parameter/debug.
See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_OLD_API
- bool
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 2749933852a9..d645f8b302a2 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -214,7 +214,7 @@ void fscache_relinquish_cache(struct fscache_cache *cache)
cache->ops = NULL;
cache->cache_priv = NULL;
- smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT);
+ fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT);
fscache_put_cache(cache, where);
}
EXPORT_SYMBOL(fscache_relinquish_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 9bb1ab5fe5ed..9d3cf0111709 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
-unsigned int fscache_lru_cookie_timeout = 10 * HZ;
+static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
{
@@ -1069,6 +1069,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
}
EXPORT_SYMBOL(__fscache_invalidate);
+#ifdef CONFIG_PROC_FS
/*
* Generate a list of extant cookies in /proc/fs/fscache/cookies
*/
@@ -1145,3 +1146,4 @@ const struct seq_operations fscache_cookies_seq_ops = {
.stop = fscache_cookies_seq_stop,
.show = fscache_cookies_seq_show,
};
+#endif
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f121c21590dc..1336f517e9b1 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -56,7 +56,9 @@ static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
* cookie.c
*/
extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
extern struct timer_list fscache_cookie_lru_timer;
extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
@@ -71,17 +73,6 @@ static inline void fscache_see_cookie(struct fscache_cookie *cookie,
}
/*
- * io.c
- */
-static inline void fscache_end_operation(struct netfs_cache_resources *cres)
-{
- const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
-
- if (ops)
- ops->end_operation(cres);
-}
-
-/*
* main.c
*/
extern unsigned fscache_debug;
@@ -148,7 +139,9 @@ int fscache_stats_show(struct seq_file *m, void *v);
/*
* volume.c
*/
+#ifdef CONFIG_PROC_FS
extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 7a769ea57720..3af3b08a9bb3 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -159,27 +159,29 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
EXPORT_SYMBOL(__fscache_begin_write_operation);
/**
- * fscache_set_page_dirty - Mark page dirty and pin a cache object for writeback
- * @page: The page being dirtied
+ * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
* @cookie: The cookie referring to the cache object
*
- * Set the dirty flag on a page and pin an in-use cache object in memory when
- * dirtying a page so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->set_page_dirty() method.
+ * Set the dirty flag on a folio and pin an in-use cache object in memory
+ * so that writeback can later write to it. This is intended
+ * to be called from the filesystem's ->dirty_folio() method.
*
- * Returns 1 if PG_dirty was set on the page, 0 otherwise.
+ * Return: true if the dirty flag was set on the folio, false otherwise.
*/
-int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie)
+bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
+ struct fscache_cookie *cookie)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
bool need_use = false;
_enter("");
- if (!__set_page_dirty_nobuffers(page))
- return 0;
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
if (!fscache_cookie_valid(cookie))
- return 1;
+ return true;
if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
spin_lock(&inode->i_lock);
@@ -192,9 +194,9 @@ int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie)
if (need_use)
fscache_use_cookie(cookie, true);
}
- return 1;
+ return true;
}
-EXPORT_SYMBOL(fscache_set_page_dirty);
+EXPORT_SYMBOL(fscache_dirty_folio);
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
@@ -233,8 +235,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources),
- wreq->mapping, wreq->start, wreq->len,
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
wreq->set_bits);
if (wreq->term_func)
@@ -294,7 +295,7 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(cookie, mapping, start, len, cond);
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 000d2e5627e9..7cede9a3bc96 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
- struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
-
- /*
- * Get any fuse_mount belonging to this fuse_conn; s_bdi is
- * shared between all of them
- */
-
- if (!list_empty(&fc->mounts)) {
- fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 182b24a14804..d7d3a7f06862 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1326,8 +1326,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
static const struct address_space_operations fuse_dax_file_aops = {
.writepages = fuse_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 592730fd6e42..0e537e580dc1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 656e921f3506..9ff27b8a9782 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1773,7 +1773,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/*
* Only call invalidate_inode_pages2() after removing
- * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock.
*/
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0fc150c1c50b..f18d14d5fea1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -966,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac)
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ break;
+
nr_pages = readahead_count(rac) - nr_pages;
if (nr_pages > max_pages)
nr_pages = max_pages;
@@ -1959,6 +1967,7 @@ err:
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;
if (fuse_page_is_writeback(page->mapping->host, page->index)) {
@@ -1974,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
err = fuse_writepage_locked(page);
unlock_page(page);
@@ -2227,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping,
if (fuse_is_bad(inode))
goto out;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
+
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;
@@ -2331,17 +2348,17 @@ unlock:
return copied;
}
-static int fuse_launder_page(struct page *page)
+static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (clear_page_dirty_for_io(page)) {
- struct inode *inode = page->mapping->host;
+ if (folio_clear_dirty_for_io(folio)) {
+ struct inode *inode = folio->mapping->host;
/* Serialize with pending writeback for the same page */
- fuse_wait_on_page_writeback(inode, page->index);
- err = fuse_writepage_locked(page);
+ fuse_wait_on_page_writeback(inode, folio->index);
+ err = fuse_writepage_locked(&folio->page);
if (!err)
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_page_writeback(inode, folio->index);
}
return err;
}
@@ -3162,8 +3179,8 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
- .launder_page = fuse_launder_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .launder_folio = fuse_launder_folio,
+ .dirty_folio = filemap_dirty_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index eac4984cc753..488b460e046f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -627,7 +627,7 @@ struct fuse_conn {
/** Connection successful. Only set in INIT */
unsigned conn_init:1;
- /** Do readpages asynchronously? Only set in INIT */
+ /** Do readahead asynchronously? Only set in INIT */
unsigned async_read:1;
/** Return an unique read error after abort. Only set in INIT */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9ee36aa73251..8c0665c5dff8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -72,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
- fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+ fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
if (!fi)
return NULL;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index df58966bc874..33cde4bbccdc 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -170,7 +170,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
#else
if (flags & FUSE_IOCTL_COMPAT) {
inarg.flags |= FUSE_IOCTL_32BIT;
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall())
inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 9d737904d07c..86b7dbb6a0d4 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -8,6 +8,7 @@
#include <linux/dax.h>
#include <linux/pci.h>
#include <linux/pfn_t.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 005e920f5d4a..72c9f31ce724 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -606,18 +606,12 @@ out:
gfs2_trans_end(sdp);
}
-/**
- * jdata_set_page_dirty - Page dirtying function
- * @page: The page to dirty
- *
- * Returns: 1 if it dirtyed the page, or 0 otherwise
- */
-
-static int jdata_set_page_dirty(struct page *page)
+static bool jdata_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
if (current->journal_info)
- SetPageChecked(page);
- return __set_page_dirty_buffers(page);
+ folio_set_checked(folio);
+ return block_dirty_folio(mapping, folio);
}
/**
@@ -672,22 +666,23 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void gfs2_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
- unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host);
+ size_t stop = offset + length;
+ int partial_page = (offset || length < folio_size(folio));
struct buffer_head *bh, *head;
unsigned long pos = 0;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
- if (!page_has_buffers(page))
+ folio_clear_checked(folio);
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- bh = head = page_buffers(page);
+ bh = head;
do {
if (pos + bh->b_size > stop)
return;
@@ -699,7 +694,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
out:
if (!partial_page)
- try_to_release_page(page, 0);
+ filemap_release_folio(folio, 0);
}
/**
@@ -779,9 +774,9 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -794,9 +789,9 @@ static const struct address_space_operations gfs2_jdata_aops = {
.writepages = gfs2_jdata_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = jdata_set_page_dirty,
+ .dirty_folio = jdata_dirty_folio,
.bmap = gfs2_bmap,
- .invalidatepage = gfs2_invalidatepage,
+ .invalidate_folio = gfs2_invalidate_folio,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d67108489148..39080b2d6cf8 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -606,9 +606,9 @@ out:
return ret;
}
-static inline __be64 *gfs2_indirect_init(struct metapath *mp,
- struct gfs2_glock *gl, unsigned int i,
- unsigned offset, u64 bn)
+static inline void gfs2_indirect_init(struct metapath *mp,
+ struct gfs2_glock *gl, unsigned int i,
+ unsigned offset, u64 bn)
{
__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
((i > 1) ? sizeof(struct gfs2_meta_header) :
@@ -621,7 +621,6 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,
gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
ptr += offset;
*ptr = cpu_to_be64(bn);
- return ptr;
}
enum alloc_state {
@@ -2146,7 +2145,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
ret = do_shrink(inode, newsize);
out:
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 8c39a8571b1f..48f01323c37c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -706,7 +706,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE) {
if (gfs2_rs_active(&ip->i_res))
- gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
}
return 0;
@@ -775,8 +775,7 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
size_t *window_size)
{
size_t count = iov_iter_count(i);
- char __user *p;
- int pages = 1;
+ size_t size, offs;
if (likely(!count))
return false;
@@ -785,18 +784,20 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
if (!iter_is_iovec(i))
return false;
+ size = PAGE_SIZE;
+ offs = offset_in_page(i->iov[0].iov_base + i->iov_offset);
if (*prev_count != count || !*window_size) {
- int pages, nr_dirtied;
+ size_t nr_dirtied;
- pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE));
+ size = ALIGN(offs + count, PAGE_SIZE);
+ size = min_t(size_t, size, SZ_1M);
nr_dirtied = max(current->nr_dirtied_pause -
- current->nr_dirtied, 1);
- pages = min(pages, nr_dirtied);
+ current->nr_dirtied, 8);
+ size = min(size, nr_dirtied << PAGE_SHIFT);
}
*prev_count = count;
- p = i->iov[0].iov_base + i->iov_offset;
- *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
+ *window_size = size - offs;
return true;
}
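
For reference, a standalone sketch of the window arithmetic the rewritten should_fault_in_pages() performs: round the request up to whole pages, cap it at 1 MiB and at the task's dirty-page budget, then subtract the in-page offset. PAGE_SIZE, SZ_1M and the sample inputs are assumptions of the sketch rather than values taken from a running system.

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumed 4K pages for the example */
#define PAGE_SHIFT	12
#define SZ_1M		(1024UL * 1024)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* offs: offset into the first page; count: bytes requested;
 * nr_dirtied: per-task dirty-page budget (already clamped to >= 8). */
static unsigned long window_size(unsigned long offs, unsigned long count,
				 unsigned long nr_dirtied)
{
	unsigned long size = ALIGN(offs + count, PAGE_SIZE);

	size = MIN(size, SZ_1M);			/* never more than 1 MiB */
	size = MIN(size, nr_dirtied << PAGE_SHIFT);	/* nor than the budget   */
	return size - offs;
}

int main(void)
{
	/* 100 bytes into a page, 1 MiB requested, budget of 8 dirty pages */
	printf("fault-in window: %lu bytes\n", window_size(100, SZ_1M, 8));
	return 0;
}
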
@@ -851,9 +852,9 @@ retry_under_glock:
leftover = fault_in_iov_iter_writeable(to, window_size);
gfs2_holder_disallow_demote(gh);
if (leftover != window_size) {
- if (!gfs2_holder_queued(gh))
- goto retry;
- goto retry_under_glock;
+ if (gfs2_holder_queued(gh))
+ goto retry_under_glock;
+ goto retry;
}
}
if (gfs2_holder_queued(gh))
@@ -898,10 +899,10 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
goto out;
+retry_under_glock:
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
@@ -920,9 +921,9 @@ retry_under_glock:
leftover = fault_in_iov_iter_readable(from, window_size);
gfs2_holder_disallow_demote(gh);
if (leftover != window_size) {
- if (!gfs2_holder_queued(gh))
- goto retry;
- goto retry_under_glock;
+ if (gfs2_holder_queued(gh))
+ goto retry_under_glock;
+ goto retry;
}
}
out:
@@ -950,20 +951,19 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* and retry.
*/
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = gfs2_file_direct_read(iocb, to, &gh);
- if (likely(ret != -ENOTBLK))
- return ret;
- iocb->ki_flags &= ~IOCB_DIRECT;
- }
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return gfs2_file_direct_read(iocb, to, &gh);
+
+ pagefault_disable();
iocb->ki_flags |= IOCB_NOIO;
ret = generic_file_read_iter(iocb, to);
iocb->ki_flags &= ~IOCB_NOIO;
+ pagefault_enable();
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
written = ret;
- } else {
+ } else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -989,12 +989,9 @@ retry_under_glock:
leftover = fault_in_iov_iter_writeable(to, window_size);
gfs2_holder_disallow_demote(&gh);
if (leftover != window_size) {
- if (!gfs2_holder_queued(&gh)) {
- if (written)
- goto out_uninit;
- goto retry;
- }
- goto retry_under_glock;
+ if (gfs2_holder_queued(&gh))
+ goto retry_under_glock;
+ goto retry;
}
}
if (gfs2_holder_queued(&gh))
@@ -1068,12 +1065,9 @@ retry_under_glock:
gfs2_holder_disallow_demote(gh);
if (leftover != window_size) {
from->count = min(from->count, window_size - leftover);
- if (!gfs2_holder_queued(gh)) {
- if (read)
- goto out_uninit;
- goto retry;
- }
- goto retry_under_glock;
+ if (gfs2_holder_queued(gh))
+ goto retry_under_glock;
+ goto retry;
}
}
out_unlock:
@@ -1083,6 +1077,7 @@ out_uninit:
gfs2_holder_uninit(gh);
if (statfs_gh)
kfree(statfs_gh);
+ from->count = orig_count - read;
return read ? read : ret;
}
@@ -1497,7 +1492,6 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (error != GLR_TRYFAILED)
break;
fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
- fl_gh->gh_error = 0;
msleep(sleeptime);
}
if (error) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b23399eaee0..630c6550eacf 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -542,7 +542,7 @@ restart:
* some reason. If this holder is the head of the list, it
* means we have a blocked holder at the head, so return 1.
*/
- if (gh->gh_list.prev == &gl->gl_holders)
+ if (list_is_first(&gh->gh_list, &gl->gl_holders))
return 1;
do_error(gl, 0);
break;
@@ -669,6 +669,8 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
/* Check for state != intended state */
if (unlikely(state != gl->gl_target)) {
+ if (gh && (ret & LM_OUT_CANCELED))
+ gfs2_holder_wake(gh);
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
/* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
@@ -1259,7 +1261,6 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
- gh->gh_error = 0;
gh->gh_iflags = 0;
gfs2_glock_hold(gl);
}
@@ -1565,6 +1566,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
+ gh->gh_error = 0;
spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
@@ -1691,6 +1693,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
struct gfs2_glock *gl = gh->gh_gl;
spin_lock(&gl->gl_lockref.lock);
+ if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
+ !test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ spin_unlock(&gl->gl_lockref.lock);
+ gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl);
+ wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
__gfs2_glock_dq(gh);
spin_unlock(&gl->gl_lockref.lock);
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 89905f4f29bb..c8ec876f33ea 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -131,7 +131,21 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_glock *io_gl;
- error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE,
+ &ip->i_gl);
+ if (unlikely(error))
+ goto fail;
+
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE,
+ &io_gl);
+ if (unlikely(error))
+ goto fail;
+
+ if (blktype != GFS2_BLKST_UNLINKED)
+ gfs2_cancel_delete_work(io_gl);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT,
+ &ip->i_iopen_gh);
+ gfs2_glock_put(io_gl);
if (unlikely(error))
goto fail;
@@ -161,16 +175,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
- error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
- if (unlikely(error))
- goto fail;
- if (blktype != GFS2_BLKST_UNLINKED)
- gfs2_cancel_delete_work(io_gl);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
- gfs2_glock_put(io_gl);
- if (unlikely(error))
- goto fail;
-
/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
inode->i_atime.tv_nsec = 0;
@@ -716,13 +720,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
BUG_ON(error);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ if (error)
+ goto fail_gunlock3;
+
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
- goto fail_gunlock2;
+ goto fail_gunlock3;
if (blocks > 1) {
ip->i_eattr = ip->i_no_addr + 1;
@@ -731,10 +739,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
- if (error)
- goto fail_gunlock2;
-
glock_set_object(ip->i_gl, ip);
glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
@@ -745,14 +749,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
posix_acl_release(default_acl);
default_acl = NULL;
}
if (acl) {
error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
posix_acl_release(acl);
acl = NULL;
}
@@ -760,11 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
&gfs2_initxattrs, NULL);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
error = link_dinode(dip, name, ip, &da);
if (error)
- goto fail_gunlock3;
+ goto fail_gunlock4;
mark_inode_dirty(inode);
d_instantiate(dentry, inode);
@@ -782,9 +786,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
unlock_new_inode(inode);
return error;
-fail_gunlock3:
+fail_gunlock4:
glock_clear_object(ip->i_gl, ip);
glock_clear_object(io_gl, ip);
+fail_gunlock3:
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_gunlock2:
gfs2_glock_put(io_gl);
@@ -793,7 +798,7 @@ fail_free_inode:
if (free_vfs_inode) /* else evict will do the put for us */
gfs2_glock_put(ip->i_gl);
}
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
posix_acl_release(default_acl);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 50578f881e6d..2559a79cf14b 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -261,6 +261,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
int req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
+ int error;
req = make_mode(gl->gl_name.ln_sbd, req_state);
lkf = make_flags(gl, flags, req);
@@ -279,8 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
* Submit the actual lock request.
*/
- return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
+again:
+ error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ if (error == -EBUSY) {
+ msleep(20);
+ goto again;
+ }
+ return error;
}
static void gdlm_put_lock(struct gfs2_glock *gl)
@@ -312,8 +319,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
+again:
error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
NULL, gl);
+ if (error == -EBUSY) {
+ msleep(20);
+ goto again;
+ }
+
if (error) {
fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
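
The two hunks above wrap dlm_lock() and dlm_unlock() in the same retry-on-EBUSY idiom. A generic, self-contained sketch of that idiom follows, with do_request() as a hypothetical stand-in for the DLM call; it is not the kernel implementation.

#include <errno.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for the lock-manager call: busy twice, then succeeds. */
static int do_request(void)
{
	static int busy = 2;

	return busy-- > 0 ? -EBUSY : 0;
}

static int submit_with_retry(void)
{
	int error;

	for (;;) {
		error = do_request();
		if (error != -EBUSY)
			return error;
		/* brief pause before resubmitting, mirroring the msleep(20) above */
		nanosleep(&(struct timespec){ .tv_sec = 0, .tv_nsec = 20000000L },
			  NULL);
	}
}

int main(void)
{
	printf("result = %d\n", submit_with_retry());
	return 0;
}
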
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ca0bb3a73912..6ba51cbb94cf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO);
bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
- bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -489,11 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
{
struct bio *new;
- new = bio_alloc(GFP_NOIO, nr_iovecs);
- bio_copy_dev(new, prev);
+ new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(new, prev);
submit_bio(prev);
return new;
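The lops.c change above (like the meta_io.c, ops_fstype.c and hfsplus hunks
below) follows the updated bio allocation interface: bio_alloc() now takes the
target block device and the operation flags up front, so the separate
bio_set_dev() and bio_set_op_attrs() calls go away. A small sketch of the
new-style call, with example_read_bio() as an illustrative name:

	#include <linux/bio.h>

	/* allocate a single-segment READ bio aimed at bdev */
	static struct bio *example_read_bio(struct block_device *bdev,
					    sector_t sector)
	{
		struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_NOIO);

		bio->bi_iter.bi_sector = sector;
		return bio;
	}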
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 72d30a682ece..d8bd1d48bd78 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -89,13 +89,15 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
}
const struct address_space_operations gfs2_meta_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
const struct address_space_operations gfs2_rgrp_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
@@ -222,9 +224,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
struct buffer_head *bh = *bhs;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, num);
+ bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
while (num > 0) {
bh = *bhs;
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@ -235,7 +236,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
num--;
}
bio->bi_end_io = gfs2_meta_read_endio;
- bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
}
}
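The address_space_operations updates in this file (repeated below for hfs,
hfsplus, hpfs and others) are the common buffer-head conversion: the old
->set_page_dirty hook is replaced by the folio-based ->dirty_folio and
->invalidate_folio helpers from fs/buffer.c. A sketch of the resulting table
shape; example_aops is an illustrative name, and real tables keep their
->writepage/->readpage entries as shown above:

	#include <linux/buffer_head.h>

	static const struct address_space_operations example_aops = {
		.dirty_folio		= block_dirty_folio,
		.invalidate_folio	= block_invalidate_folio,
	};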
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7f8410d8fdc1..c9b423c874a3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
ClearPageDirty(page);
lock_page(page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS);
bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- bio_set_dev(bio, sb->s_bdev);
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 0fb3c01bc557..801ad9f4f2be 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
/**
* gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
- * @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip)
{
+ struct inode *inode = &ip->i_inode;
+
down_write(&ip->i_rw_mutex);
- if ((wcount == NULL) || (atomic_read(wcount) <= 1))
+ if (atomic_read(&inode->i_writecount) <= 1)
gfs2_rs_deltree(&ip->i_res);
up_write(&ip->i_rw_mutex);
}
@@ -922,15 +923,15 @@ static int read_rindex_entry(struct gfs2_inode *ip)
spin_lock_init(&rgd->rd_rsspin);
mutex_init(&rgd->rd_mutex);
- error = compute_bitstructs(rgd);
- if (error)
- goto fail;
-
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
goto fail;
+ error = compute_bitstructs(rgd);
+ if (error)
+ goto fail_glock;
+
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -944,6 +945,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
}
error = 0; /* someone else read in the rgrp; free it and ignore it */
+fail_glock:
gfs2_glock_put(rgd->rd_gl);
fail:
@@ -1415,7 +1417,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
- minlen = max_t(u64, r.minlen,
+ minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
+ minlen = max_t(u64, minlen,
q->limits.discard_granularity) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
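The gfs2_fitrim() hunk above raises the requested minimum extent length to at
least one filesystem block before also clamping it to the device's discard
granularity; previously only the granularity was considered. A worked sketch
of the two max_t() lines (clamp_minlen() is a hypothetical helper):

	#include <linux/minmax.h>
	#include <linux/types.h>

	static u64 clamp_minlen(u64 requested, u32 bsize,
				u32 discard_granularity, unsigned int bs_shift)
	{
		u64 minlen = max_t(u64, requested, bsize);

		return max_t(u64, minlen, discard_granularity) >> bs_shift;
	}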
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3e2ca1fb4305..46dd94e9e085 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 64c67090f503..bdb773e5c88f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1396,7 +1396,7 @@ out:
truncate_inode_pages_final(&inode->i_data);
if (ip->i_qadata)
gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
@@ -1425,7 +1425,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
struct gfs2_inode *ip;
- ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+ ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
ip->i_flags = 0;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a6002b2d146d..d87ea98cf535 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -15,7 +15,7 @@
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/gfs2_ondisk.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2a5143246282..55f45e9b4930 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -159,7 +159,8 @@ static int hfs_writepages(struct address_space *mapping,
}
const struct address_space_operations hfs_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
@@ -169,7 +170,8 @@ const struct address_space_operations hfs_btree_aops = {
};
const struct address_space_operations hfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5beb82652435..8082eb01127c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -9,7 +9,7 @@
*/
#include <linux/cdrom.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/nls.h>
#include <linux/slab.h>
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 12d9bae39363..6764afa98a6f 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
{
struct hfs_inode_info *i;
- i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d08a8d1d40a4..446a816aa8e1 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -156,7 +156,8 @@ static int hfsplus_writepages(struct address_space *mapping,
}
const struct address_space_operations hfsplus_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -166,7 +167,8 @@ const struct address_space_operations hfsplus_btree_aops = {
};
const struct address_space_operations hfsplus_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index b9e3db3f855f..8479add998b5 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
{
struct hfsplus_inode_info *i;
- i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 51ae6f1eb4a5..0b8ad6586df5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
-#include <linux/genhd.h>
#include <asm/unaligned.h>
#include "hfsplus_fs.h"
@@ -64,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, sb->s_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (op != WRITE && data)
*data = (u8 *)buf + offset;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ef481c3d9019..14f9ac973a2e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,6 +14,7 @@
#include <linux/statfs.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include "hostfs.h"
@@ -222,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
+ hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
static const struct address_space_operations hostfs_aops = {
.writepage = hostfs_writepage,
.readpage = hostfs_readpage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index fb37f57130aa..99493a23c5d0 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -245,7 +245,8 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
const struct address_space_operations hpfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a7dbfc892022..1cb89595b875 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep;
static struct inode *hpfs_alloc_inode(struct super_block *sb)
{
struct hpfs_inode_info *ei;
- ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a7c6c7498be0..dd3a088db11d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
addr = vm_unmapped_area(&info);
}
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -1110,7 +1111,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
- p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+ p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
@@ -1144,7 +1145,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.migratepage = hugetlbfs_migrate_page,
.error_remove_page = hugetlbfs_error_remove_page,
};
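The hugetlbfs changes above switch the unmapped-area search bounds from raw
TASK_SIZE / mmap_base values to the arch_get_mmap_end() and
arch_get_mmap_base() hooks, so architectures with a non-default user address
space layout (arm64, for example) constrain huge-page mappings the same way
they constrain regular mmap(). For reference, the generic fallbacks are simply
the old values, sketched here from the defaults as of this series:

	#ifndef arch_get_mmap_end
	#define arch_get_mmap_end(addr)		(TASK_SIZE)
	#endif

	#ifndef arch_get_mmap_base
	#define arch_get_mmap_base(addr, base)	(base)
	#endif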
diff --git a/fs/inode.c b/fs/inode.c
index 63324df6fa27..9d9b422504d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb)
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
- inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
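The hfs, hfsplus, hpfs, hostfs, hugetlbfs and generic fs/inode.c hunks above
all make the same substitution in ->alloc_inode: alloc_inode_sb() replaces
kmem_cache_alloc() so the new inode is allocated against the superblock and
its memcg-aware inode LRU rather than from the bare slab cache. A sketch of
the converted pattern, using illustrative example_* names:

	#include <linux/fs.h>
	#include <linux/slab.h>

	struct example_inode_info {
		struct inode vfs_inode;
	};

	static struct kmem_cache *example_inode_cachep;

	static struct inode *example_alloc_inode(struct super_block *sb)
	{
		struct example_inode_info *ei;

		ei = alloc_inode_sb(sb, example_inode_cachep, GFP_KERNEL);
		return ei ? &ei->vfs_inode : NULL;
	}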
diff --git a/fs/internal.h b/fs/internal.h
index 711bdc00ec7c..08503dc68d2b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -74,7 +74,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
* namespace.c
*/
extern struct vfsmount *lookup_mnt(const struct path *);
-extern int finish_automount(struct vfsmount *, struct path *);
+extern int finish_automount(struct vfsmount *, const struct path *);
extern int sb_prepare_remount_readonly(struct super_block *);
@@ -179,7 +179,9 @@ int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+
+int getname_statx_lookup_flags(int flags);
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index bb7f161bb19c..32aeb2c581c5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,7 +13,7 @@
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
#include <linux/audit.h>
#include <uapi/linux/io_uring.h>
@@ -76,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@@ -91,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
- struct io_wqe_acct acct[2];
+ struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- preempt_disable();
+ raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
+ preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
+ bool ret = false;
+
+ raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
+ ret = true;
+ raw_spin_unlock(&acct->lock);
+
+ return ret;
}
/*
@@ -385,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
- __must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- raw_spin_unlock(&wqe->lock);
- io_queue_worker_create(worker, acct, create_worker_cb);
- raw_spin_lock(&wqe->lock);
- }
+ if (!atomic_dec_and_test(&acct->nr_running))
+ return;
+ if (!io_acct_run_queue(acct))
+ return;
+
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
@@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
* it's currently on the freelist
*/
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
+ raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
+ raw_spin_unlock(&wqe->lock);
}
}
@@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
- __must_hold(wqe->lock)
+ __must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
@@ -515,7 +522,9 @@ static bool io_flush_signals(void)
{
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
- tracehook_notify_signal();
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
return true;
}
return false;
@@ -538,7 +547,6 @@ static void io_assign_current_work(struct io_worker *worker,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -555,7 +563,9 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
+ raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
+ raw_spin_unlock(&acct->lock);
if (work) {
__io_worker_busy(wqe, worker);
@@ -569,10 +579,9 @@ static void io_worker_handle_work(struct io_worker *worker)
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
- }
- raw_spin_unlock(&wqe->lock);
- if (!work)
+ } else {
break;
+ }
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -608,8 +617,6 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
-
- raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -633,12 +640,10 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock(&wqe->lock);
- if (io_acct_run_queue(acct)) {
+ while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
- goto loop;
- }
+
+ raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@@ -662,10 +667,8 @@ loop:
last_timeout = !ret;
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock(&wqe->lock);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
- }
audit_free(current);
io_worker_exit(worker);
@@ -705,10 +708,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
-
- raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@@ -778,10 +778,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
+ raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ ;
+ } else {
+ raw_spin_unlock(&wqe->lock);
}
- raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@@ -914,6 +916,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@@ -927,10 +930,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&acct->lock);
+ raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@@ -946,18 +951,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
- /* fatal condition, failed to create the first worker */
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_item,
- .data = work,
- .cancel_all = false,
- };
-
- if (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return;
}
raw_spin_unlock(&wqe->lock);
+
+ /* fatal condition, failed to create the first worker */
+ match.fn = io_wq_work_match_item,
+ match.data = work,
+ match.cancel_all = false,
+
+ io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@@ -1032,22 +1037,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
- __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
+ raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
+ raw_spin_unlock(&acct->lock);
return false;
}
@@ -1061,7 +1067,6 @@ retry:
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
- raw_spin_lock(&wqe->lock);
if (match->cancel_all)
goto retry;
break;
@@ -1103,13 +1108,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all) {
- raw_spin_unlock(&wqe->lock);
+ if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
- }
+ raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
@@ -1190,6 +1193,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
+ raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@@ -1282,9 +1286,7 @@ static void io_wq_destroy(struct io_wq *wq)
.fn = io_wq_work_match_all,
.cancel_all = true,
};
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
@@ -1376,7 +1378,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
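The io-wq.c changes above split the pending-work protection out of the big
per-node wqe->lock: each io_wqe_acct now carries its own raw spinlock guarding
its work_list and stall flags, and helpers such as io_acct_run_queue() and
io_acct_cancel_pending_work() take that lock themselves. A reduced sketch of
the idea; the example_* names are illustrative, not the real structures:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct example_acct {
		raw_spinlock_t		lock;	/* protects work_list */
		struct list_head	work_list;
	};

	static bool example_acct_has_work(struct example_acct *acct)
	{
		bool ret;

		raw_spin_lock(&acct->lock);
		ret = !list_empty(&acct->work_list);
		raw_spin_unlock(&acct->lock);
		return ret;
	}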
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4715980e9015..e01f595f5b7d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,7 +78,6 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/security.h>
@@ -112,8 +111,7 @@
IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
+ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -263,11 +261,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -326,6 +331,14 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+};
+
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -335,11 +348,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -378,7 +391,9 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
@@ -399,7 +414,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -421,6 +436,8 @@ struct io_ring_ctx {
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +453,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +480,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -469,13 +493,13 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
- atomic_t inflight_tracked;
atomic_t in_idle;
spinlock_t task_lock;
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -560,7 +584,8 @@ struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
u64 addr;
- u64 len;
+ u32 len;
+ u32 flags;
};
struct io_connect {
@@ -579,6 +604,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
};
struct io_open {
@@ -621,10 +647,10 @@ struct io_epoll {
struct io_splice {
struct file *file_out;
- struct file *file_in;
loff_t off_out;
loff_t off_in;
u64 len;
+ int splice_fd_in;
unsigned int flags;
};
@@ -642,7 +668,7 @@ struct io_statx {
int dfd;
unsigned int mask;
unsigned int flags;
- const char __user *filename;
+ struct filename *filename;
struct statx __user *buffer;
};
@@ -690,6 +716,12 @@ struct io_hardlink {
int flags;
};
+struct io_msg {
+ struct file *file;
+ u64 user_data;
+ u32 len;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -741,6 +773,9 @@ enum {
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
+ REQ_F_PARTIAL_IO_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -799,6 +834,12 @@ enum {
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may be active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
+ /* request has already done partial IO */
+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
@@ -825,7 +866,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -855,6 +896,7 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
+ struct io_msg msg;
};
u8 opcode;
@@ -865,7 +907,11 @@ struct io_kiocb {
u64 user_data;
u32 result;
- u32 cflags;
+ /* fd initially, then cflags for completion */
+ union {
+ u32 cflags;
+ int fd;
+ };
struct io_ring_ctx *ctx;
struct task_struct *task;
@@ -874,10 +920,14 @@ struct io_kiocb {
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
- /* used by request caches, completion batching and iopoll */
- struct io_wq_work_node comp_list;
+ union {
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ /* cache ->apoll->events */
+ int apoll_events;
+ };
atomic_t refs;
- struct io_kiocb *link;
+ atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -885,12 +935,13 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- struct io_wq_work work;
- /* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
- atomic_t poll_refs;
+ /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+ struct io_kiocb *link;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
+ const struct cred *creds;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -917,6 +968,7 @@ struct io_op_def {
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
+ unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
@@ -1011,6 +1063,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .poll_exclusive = 1,
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
@@ -1105,6 +1158,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -1127,8 +1183,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
struct io_uring_rsrc_update2 *up,
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed);
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned issue_flags);
+static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static void io_drop_inflight_file(struct io_kiocb *req);
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
@@ -1141,6 +1200,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1257,75 +1317,127 @@ static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
}
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
+ struct io_ring_ctx *ctx,
+ unsigned int issue_flags)
{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- ctx->rsrc_cached_refs--;
- if (unlikely(ctx->rsrc_cached_refs < 0))
- io_rsrc_refs_refill(ctx);
+
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ lockdep_assert_held(&ctx->uring_lock);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
+ } else {
+ percpu_ref_get(req->fixed_rsrc_refs);
+ }
}
}
-static unsigned int __io_put_kbuf(struct io_kiocb *req)
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
struct io_buffer *kbuf = req->kbuf;
unsigned int cflags;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
+ list_add(&kbuf->list, list);
req->kbuf = NULL;
return cflags;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
+ lockdep_assert_held(&req->ctx->completion_lock);
+
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbuf(req);
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
{
- bool got = percpu_ref_tryget(ref);
+ unsigned int cflags;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
+ }
+
+ return cflags;
}
-static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
- bool cancel_all)
- __must_hold(&req->ctx->timeout_lock)
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- struct io_kiocb *req;
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
- if (task && head->task != task)
- return false;
- if (cancel_all)
- return true;
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
+ return NULL;
}
-static bool io_match_linked(struct io_kiocb *head)
+static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
- struct io_kiocb *req;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+ /* don't recycle if we already did IO to this buffer */
+ if (req->flags & REQ_F_PARTIAL_IO)
+ return;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
+ bool cancel_all)
+ __must_hold(&req->ctx->timeout_lock)
+{
+ if (task && head->task != task)
+ return false;
+ return cancel_all;
}
/*
@@ -1335,24 +1447,9 @@ static bool io_match_linked(struct io_kiocb *head)
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
{
- bool matched;
-
if (task && head->task != task)
return false;
- if (cancel_all)
- return true;
-
- if (head->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = head->ctx;
-
- /* protect against races with linked timeouts */
- spin_lock_irq(&ctx->timeout_lock);
- matched = io_match_linked(head);
- spin_unlock_irq(&ctx->timeout_lock);
- } else {
- matched = io_match_linked(head);
- }
- return matched;
+ return cancel_all;
}
static inline bool req_has_async_data(struct io_kiocb *req)
@@ -1409,7 +1506,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1436,6 +1533,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1444,14 +1548,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1468,6 +1575,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1500,14 +1608,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req)
return req->flags & REQ_F_FIXED_FILE;
}
-static inline void io_req_track_inflight(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_INFLIGHT)) {
- req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
- }
-}
-
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
@@ -1551,14 +1651,6 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- switch (req->opcode) {
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
- }
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1610,8 +1702,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1652,12 +1744,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->timeout_lock);
- while (!list_empty(&ctx->timeout_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
u32 events_needed, events_got;
- struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, timeout.list);
if (io_is_timeout_noseq(req))
break;
@@ -1674,29 +1765,33 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
if (events_got < events_needed)
break;
- list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
}
ctx->cq_last_tm_flush = seq;
spin_unlock_irq(&ctx->timeout_lock);
}
-static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1726,23 +1821,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists in case an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1751,21 +1857,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no dependency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
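The eventfd rework in the hunks above moves the registered eventfd out of the
ring context proper and into a small io_ev_fd object published through an RCU
pointer, so the signalling fast path reads it locklessly while unregistration
frees it after a grace period. A trimmed sketch of the read side; names follow
the patch, and the worker check, registration and teardown paths are omitted:

	#include <linux/eventfd.h>
	#include <linux/rcupdate.h>

	struct example_ev_fd {
		struct eventfd_ctx	*cq_ev_fd;
		unsigned int		eventfd_async : 1;
		struct rcu_head		rcu;
	};

	static void example_eventfd_signal(struct example_ev_fd __rcu **slot)
	{
		struct example_ev_fd *ev_fd;

		rcu_read_lock();
		ev_fd = rcu_dereference(*slot);
		if (ev_fd && !ev_fd->eventfd_async)
			eventfd_signal(ev_fd->cq_ev_fd, 1);
		rcu_read_unlock();
	}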
@@ -1905,8 +2022,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1922,16 +2037,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
@@ -1941,7 +2063,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -1956,6 +2078,12 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
}
}
io_req_put_rsrc(req, ctx);
+ /*
+ * Selected buffer deallocation in io_clean_op() assumes that
+ * we don't hold ->completion_lock. Clean them here to avoid
+ * deadlocks.
+ */
+ io_put_kbuf_comp(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
@@ -2000,7 +2128,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2183,7 +2311,9 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
if (!ignore_cqes) {
link->flags &= ~REQ_F_CQE_SKIP;
io_fill_cqe_req(link, res, 0);
@@ -2287,6 +2417,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
if (req->ctx != *ctx) {
if (unlikely(!*uring_locked && *ctx))
ctx_commit_and_unlock(*ctx);
@@ -2302,7 +2434,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result, io_put_kbuf(req));
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2318,6 +2451,8 @@ static void handle_tw_list(struct io_wq_work_node *node,
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
if (req->ctx != *ctx) {
ctx_flush_and_put(*ctx, locked);
*ctx = req->ctx;
@@ -2381,6 +2516,8 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
WARN_ON_ONCE(!tctx);
+ io_drop_inflight_file(req);
+
spin_lock_irqsave(&tctx->task_lock, flags);
if (priority)
wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
@@ -2530,8 +2667,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, req->result,
- req->cflags);
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
}
io_commit_cqring(ctx);
@@ -2590,9 +2735,11 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline bool io_run_task_work(void)
{
- if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
- tracehook_notify_signal();
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
return true;
}
@@ -2650,11 +2797,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
/* order with io_complete_rw_iopoll(), e.g. ->result updates */
if (!smp_load_acquire(&req->iopoll_completed))
break;
+ nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
-
- __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req));
- nr_events++;
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
}
if (unlikely(!nr_events))
@@ -2813,8 +2959,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+ if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
kiocb_end_write(req);
+ fsnotify_modify(req->file);
+ } else {
+ fsnotify_access(req->file);
+ }
if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
@@ -2829,14 +2979,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_kbuf(req);
int res = req->result;
if (*locked) {
- io_req_complete_state(req, res, cflags);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
@@ -2845,7 +2995,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
static void io_complete_rw(struct kiocb *kiocb, long res)
@@ -2990,50 +3141,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req)
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req))
- req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
-
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
- kiocb->ki_pos = 0;
- }
- }
- kiocb->ki_flags = iocb_flags(file);
- ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
- if (unlikely(ret))
- return ret;
-
- /*
- * If the file is marked O_NONBLOCK, still allow retry for it if it
- * supports async. Otherwise it's impossible to use O_NONBLOCK files
- * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
- */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
- req->flags |= REQ_F_NOWAIT;
-
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
- return -EOPNOTSUPP;
-
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
- kiocb->ki_complete = io_complete_rw_iopoll;
- req->iopoll_completed = 0;
- } else {
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
- }
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -3049,6 +3161,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
+ req->rw.flags = READ_ONCE(sqe->rw_flags);
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3074,6 +3187,23 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ if (kiocb->ki_pos != -1)
+ return &kiocb->ki_pos;
+
+ if (!(req->file->f_mode & FMODE_STREAM)) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
+ }
+
+ kiocb->ki_pos = 0;
+ return NULL;
+}
+
static void kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
@@ -3096,14 +3226,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req))
io_req_task_queue_reissue(req);
- } else {
- req_set_fail(req);
- req->result = ret;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
- }
+ else
+ io_req_task_queue_fail(req, ret);
}
}
@@ -3165,7 +3291,8 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
return 0;
}
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+ unsigned int issue_flags)
{
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
@@ -3175,7 +3302,7 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
- io_req_set_rsrc_node(req, ctx);
+ io_req_set_rsrc_node(req, ctx, issue_flags);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3201,30 +3328,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
mutex_lock(&ctx->uring_lock);
}
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
+{
+ struct list_head *list;
+
+ list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ INIT_LIST_HEAD(&bl->buf_list);
+ bl->bgid = bgid;
+ list_add(&bl->list, list);
+}
+
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
int bgid, unsigned int issue_flags)
{
struct io_buffer *kbuf = req->kbuf;
- struct io_buffer *head;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
- io_ring_submit_lock(req->ctx, needs_lock);
+ io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&req->ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
- head = xa_load(&req->ctx->io_buffers, bgid);
- if (head) {
- if (!list_empty(&head->list)) {
- kbuf = list_last_entry(&head->list, struct io_buffer,
- list);
- list_del(&kbuf->list);
- } else {
- kbuf = head;
- xa_erase(&req->ctx->io_buffers, bgid);
- }
+ bl = io_buffer_get_list(ctx, bgid);
+ if (bl && !list_empty(&bl->buf_list)) {
+ kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+ list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
@@ -3331,7 +3464,7 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(req, rw, iter);
+ ret = io_import_fixed(req, rw, iter, issue_flags);
if (ret)
return ERR_PTR(ret);
return NULL;
@@ -3400,6 +3533,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3412,6 +3546,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
!(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
+ ppos = io_kiocb_ppos(kiocb);
+
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3425,10 +3561,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3436,13 +3572,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3527,13 +3665,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
return 0;
}
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_READ)))
- return -EBADF;
- return io_prep_rw(req, sqe);
-}
-
/*
* This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
@@ -3621,6 +3752,50 @@ static bool need_read_all(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
+static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = req->file;
+ int ret;
+
+ if (unlikely(!file || !(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+ kiocb->ki_flags = iocb_flags(file);
+ ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * If the file is marked O_NONBLOCK, still allow retry for it if it
+ * supports async. Otherwise it's impossible to use O_NONBLOCK files
+ * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
+ */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+ req->flags |= REQ_F_NOWAIT;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
+ return -EOPNOTSUPP;
+
+ kiocb->private = NULL;
+ kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ req->iopoll_completed = 0;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
+ kiocb->ki_complete = io_complete_rw;
+ }
+
+ return 0;
+}
+
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw_state __s, *s = &__s;
@@ -3629,12 +3804,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_async_rw *rw;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
} else {
+ /*
+ * Safe and required to re-import if we're using provided
+ * buffers, as we dropped the selected one before retry.
+ */
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+
rw = req->async_data;
s = &rw->s;
/*
@@ -3645,6 +3831,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
}
+ ret = io_rw_init_file(req, FMODE_READ);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
req->result = iov_iter_count(&s->iter);
if (force_nonblock) {
@@ -3659,7 +3850,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3669,6 +3862,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
+ /* if we can poll, just do that */
+ if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3743,14 +3939,6 @@ out_free:
return 0;
}
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
- return -EBADF;
- req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
- return io_prep_rw(req, sqe);
-}
-
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw_state __s, *s = &__s;
@@ -3758,6 +3946,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
@@ -3770,6 +3959,11 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
}
+ ret = io_rw_init_file(req, FMODE_WRITE);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
req->result = iov_iter_count(&s->iter);
if (force_nonblock) {
@@ -3788,7 +3982,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -4138,18 +4334,11 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sp->file_in = NULL;
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
-
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
-
- sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (!sp->file_in)
- return -EBADF;
- req->flags |= REQ_F_NEED_CLEANUP;
+ sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
return 0;
}
@@ -4164,20 +4353,29 @@ static int io_tee_prep(struct io_kiocb *req,
static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4196,15 +4394,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
@@ -4213,8 +4420,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4235,13 +4441,53 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_msg_ring_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+ sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
+ req->msg.user_data = READ_ONCE(sqe->off);
+ req->msg.len = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx;
+ struct io_msg *msg = &req->msg;
+ bool filled;
+ int ret;
+
+ ret = -EBADFD;
+ if (req->file->f_op != &io_uring_fops)
+ goto done;
+
+ ret = -EOVERFLOW;
+ target_ctx = req->file->private_data;
+
+ spin_lock(&target_ctx->completion_lock);
+ filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
+ io_commit_cqring(target_ctx);
+ spin_unlock(&target_ctx->completion_lock);
+
+ if (filled) {
+ io_cqring_ev_posted(target_ctx);
+ ret = 0;
+ }
+
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+}
+
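As a rough userspace sketch (not part of this patch) of how the new IORING_OP_MSG_RING request is filled in, going by the SQE fields io_msg_ring_prep() reads above; it assumes the matching uapi header update from the same series, and the helper name is illustrative.

#include <string.h>
#include <linux/io_uring.h>

/* Fill an SQE that posts a CQE onto another ring. 'target_fd' is the fd of
 * the receiving io_uring instance; the target sees 'msg_data' as
 * cqe->user_data and 'msg_len' as cqe->res (see io_fill_cqe_aux() above). */
static void prep_msg_ring(struct io_uring_sqe *sqe, int target_fd,
			  __u64 msg_data, __u32 msg_len)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_fd;
	sqe->off = msg_data;	/* io_msg_ring_prep() reads user_data from ->off */
	sqe->len = msg_len;	/* and the posted CQE res value from ->len */
}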
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
@@ -4301,6 +4547,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
req->sync.len);
if (ret < 0)
req_set_fail(req);
+ else
+ fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
}
@@ -4458,8 +4706,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
- int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
@@ -4468,19 +4716,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return 0;
/* the head kbuf is the list itself */
- while (!list_empty(&buf->list)) {
+ while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
- nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
- kfree(nxt);
if (++i == nbufs)
return i;
cond_resched();
}
i++;
- kfree(buf);
- xa_erase(&ctx->io_buffers, bgid);
return i;
}
@@ -4489,7 +4734,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
@@ -4498,9 +4743,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
- head = xa_load(&ctx->io_buffers, p->bgid);
- if (head)
- ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (bl)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
if (ret < 0)
req_set_fail(req);
@@ -4545,39 +4790,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
+{
+ struct io_buffer *buf;
+ struct page *page;
+ int bufs_in_page;
+
+ /*
+ * Completions that don't happen inline (eg not under uring_lock) will
+ * add to ->io_buffers_comp. If we don't have any free buffers, check
+ * the completion list and splice those entries first.
+ */
+ if (!list_empty_careful(&ctx->io_buffers_comp)) {
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp)) {
+ list_splice_init(&ctx->io_buffers_comp,
+ &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+ return 0;
+ }
+ spin_unlock(&ctx->completion_lock);
+ }
+
+ /*
+ * No free buffers and no completion entries either. Allocate a new
+ * page worth of buffer entries and add those to our freelist.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+
+ list_add(&page->lru, &ctx->io_buffers_pages);
+
+ buf = page_address(page);
+ bufs_in_page = PAGE_SIZE / sizeof(*buf);
+ while (bufs_in_page) {
+ list_add_tail(&buf->list, &ctx->io_buffers_cache);
+ buf++;
+ bufs_in_page--;
+ }
+
+ return 0;
+}
+
+static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
+ struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
- if (!buf)
+ if (list_empty(&ctx->io_buffers_cache) &&
+ io_refill_buffer_cache(ctx))
break;
-
+ buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
+ list);
+ list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
+ buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- if (!*head) {
- INIT_LIST_HEAD(&buf->list);
- *head = buf;
- } else {
- list_add_tail(&buf->list, &(*head)->list);
- }
cond_resched();
}
- return i ? i : -ENOMEM;
+ return i ? 0 : -ENOMEM;
}
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head, *list;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
@@ -4585,14 +4871,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
- list = head = xa_load(&ctx->io_buffers, p->bgid);
-
- ret = io_add_buffers(p, &head);
- if (ret >= 0 && !list) {
- ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
- if (ret < 0)
- __io_remove_buffers(ctx, head, p->bgid, -1U);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (unlikely(!bl)) {
+ bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ io_buffer_add_list(ctx, bl, p->bgid);
}
+
+ ret = io_add_buffers(ctx, p, bl);
+err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
@@ -4722,6 +5012,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ const char __user *path;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
@@ -4731,10 +5023,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->statx.dfd = READ_ONCE(sqe->fd);
req->statx.mask = READ_ONCE(sqe->len);
- req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->statx.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.filename = getname_flags(path,
+ getname_statx_lookup_flags(req->statx.flags),
+ NULL);
+
+ if (IS_ERR(req->statx.filename)) {
+ int ret = PTR_ERR(req->statx.filename);
+
+ req->statx.filename = NULL;
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -4904,6 +5208,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -5115,6 +5421,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -5127,12 +5435,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5175,6 +5492,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
@@ -5184,7 +5506,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5234,12 +5560,24 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5258,8 +5596,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->nofile = rlimit(RLIMIT_NOFILE);
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
- (accept->flags & SOCK_CLOEXEC)))
+ if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -5277,9 +5614,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (req->file->f_flags & O_NONBLOCK)
- req->flags |= REQ_F_NOWAIT;
-
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5407,7 +5741,7 @@ struct io_poll_table {
};
#define IO_POLL_CANCEL_FLAG BIT(31)
-#define IO_POLL_REF_MASK ((1u << 20)-1)
+#define IO_POLL_REF_MASK GENMASK(30, 0)
/*
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
@@ -5474,8 +5808,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
static void io_poll_remove_entries(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = io_poll_get_single(req);
- struct io_poll_iocb *poll_double = io_poll_get_double(req);
+ /*
+ * Nothing to do if neither of those flags is set. Avoid dipping
+ * into the poll/apoll/double cachelines if we can.
+ */
+ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+ return;
/*
* While we hold the waitqueue lock and the waitqueue is nonempty,
@@ -5493,9 +5831,10 @@ static void io_poll_remove_entries(struct io_kiocb *req)
* In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
- io_poll_remove_entry(poll);
- if (poll_double)
- io_poll_remove_entry(poll_double);
+ if (req->flags & REQ_F_SINGLE_POLL)
+ io_poll_remove_entry(io_poll_get_single(req));
+ if (req->flags & REQ_F_DOUBLE_POLL)
+ io_poll_remove_entry(io_poll_get_double(req));
rcu_read_unlock();
}
@@ -5507,10 +5846,9 @@ static void io_poll_remove_entries(struct io_kiocb *req)
* either spurious wakeup or multishot CQE is served. 0 when it's done with
* the request, then the mask is stored in req->result.
*/
-static int io_poll_check_events(struct io_kiocb *req)
+static int io_poll_check_events(struct io_kiocb *req, bool locked)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_poll_iocb *poll = io_poll_get_single(req);
int v;
/* req->task == current here, checking PF_EXITING is safe */
@@ -5527,14 +5865,17 @@ static int io_poll_check_events(struct io_kiocb *req)
return -ECANCELED;
if (!req->result) {
- struct poll_table_struct pt = { ._key = poll->events };
+ struct poll_table_struct pt = { ._key = req->apoll_events };
+ unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
- req->result = vfs_poll(req->file, &pt) & poll->events;
+ if (unlikely(!io_assign_file(req, flags)))
+ return -EBADF;
+ req->result = vfs_poll(req->file, &pt) & req->apoll_events;
}
/* multishot, just fill an CQE and proceed */
- if (req->result && !(poll->events & EPOLLONESHOT)) {
- __poll_t mask = mangle_poll(req->result & poll->events);
+ if (req->result && !(req->apoll_events & EPOLLONESHOT)) {
+ __poll_t mask = mangle_poll(req->result & req->apoll_events);
bool filled;
spin_lock(&ctx->completion_lock);
@@ -5563,7 +5904,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req);
+ ret = io_poll_check_events(req, *locked);
if (ret > 0)
return;
@@ -5588,7 +5929,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req);
+ ret = io_poll_check_events(req, *locked);
if (ret > 0)
return;
@@ -5603,35 +5944,45 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
{
req->result = mask;
+ /*
+ * This is useful for poll that is armed on behalf of another
+ * request, and where the wakeup path could be on a different
+ * CPU. We want to avoid pulling in req->apoll->events for that
+ * case.
+ */
+ req->apoll_events = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
req->io_task_work.func = io_apoll_task_func;
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+ trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
io_req_task_work_add(req, false);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res)
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
{
if (io_poll_get_ownership(req))
- __io_poll_execute(req, res);
+ __io_poll_execute(req, res, events);
}
static void io_poll_cancel_req(struct io_kiocb *req)
{
io_poll_mark_cancelled(req);
/* kick tw, which should complete the request */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, 0);
}
+#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
+#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_kiocb *req = wait->private;
+ struct io_kiocb *req = wqe_to_req(wait);
struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
wait);
__poll_t mask = key_to_poll(key);
@@ -5639,7 +5990,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (unlikely(mask & POLLFREE)) {
io_poll_mark_cancelled(req);
/* we have to kick tw in case it's not already */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, poll->events);
/*
* If the waitqueue is being freed early but someone is already
@@ -5669,8 +6020,12 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
poll->head = NULL;
+ if (wqe_is_double(wait))
+ req->flags &= ~REQ_F_DOUBLE_POLL;
+ else
+ req->flags &= ~REQ_F_SINGLE_POLL;
}
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
}
return 1;
}
@@ -5680,6 +6035,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
+ unsigned long wqe_private = (unsigned long) req;
/*
* The file being polled uses multiple waitqueues for poll handling
@@ -5705,15 +6061,19 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -ENOMEM;
return;
}
+ /* mark as double wq entry */
+ wqe_private |= 1;
+ req->flags |= REQ_F_DOUBLE_POLL;
io_init_poll_iocb(poll, first->events, first->wait.func);
*poll_ptr = poll;
if (req->opcode == IORING_OP_POLL_ADD)
req->flags |= REQ_F_ASYNC_DATA;
}
+ req->flags |= REQ_F_SINGLE_POLL;
pt->nr_entries++;
poll->head = head;
- poll->wait.private = req;
+ poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(head, &poll->wait);
@@ -5740,7 +6100,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
- poll->wait.private = req;
ipt->pt._key = mask;
ipt->req = req;
@@ -5774,7 +6133,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
/* can't multishot if failed, just queue the event we've got */
if (unlikely(ipt->error || !ipt->nr_entries))
poll->events |= EPOLLONESHOT;
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
return 0;
}
@@ -5784,7 +6143,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
*/
v = atomic_dec_return(&req->poll_refs);
if (unlikely(v & IO_POLL_REF_MASK))
- __io_poll_execute(req, 0);
+ __io_poll_execute(req, 0, poll->events);
return 0;
}
@@ -5803,7 +6162,7 @@ enum {
IO_APOLL_READY
};
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
@@ -5827,20 +6186,30 @@ static int io_arm_poll_handler(struct io_kiocb *req)
} else {
mask |= POLLOUT | POLLWRNORM;
}
-
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (def->poll_exclusive)
+ mask |= EPOLLEXCLUSIVE;
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del_init(&apoll->poll.wait.entry);
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
+ io_kbuf_recycle(req, issue_flags);
+
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
@@ -5863,6 +6232,7 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
if (io_match_task_safe(req, tsk, cancel_all)) {
+ hlist_del_init(&req->hash_node);
io_poll_cancel_req(req);
found = true;
}
@@ -5975,7 +6345,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
+ req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -6092,10 +6462,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
-
- req_set_fail(req);
- io_fill_cqe_req(req, -ECANCELED, 0);
- io_put_req_deferred(req);
+ io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
@@ -6273,6 +6640,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
return -EINVAL;
+ INIT_LIST_HEAD(&req->timeout.list);
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
@@ -6479,6 +6847,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.nr = 0;
up.tags = 0;
up.resv = 0;
+ up.resv2 = 0;
io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
@@ -6499,11 +6868,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- return io_read_prep(req, sqe);
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- return io_write_prep(req, sqe);
+ return io_prep_rw(req, sqe);
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
@@ -6568,6 +6936,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_MSG_RING:
+ return io_msg_ring_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6649,7 +7019,7 @@ fail:
goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data);
+ trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
@@ -6658,8 +7028,11 @@ fail:
static void io_clean_op(struct io_kiocb *req)
{
- if (req->flags & REQ_F_BUFFER_SELECTED)
- io_put_kbuf(req);
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ spin_lock(&req->ctx->completion_lock);
+ io_put_kbuf_comp(req);
+ spin_unlock(&req->ctx->completion_lock);
+ }
if (req->flags & REQ_F_NEED_CLEANUP) {
switch (req->opcode) {
@@ -6681,11 +7054,6 @@ static void io_clean_op(struct io_kiocb *req)
kfree(io->free_iov);
break;
}
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(req->splice.file_in);
- break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
if (req->open.filename)
@@ -6709,6 +7077,10 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->hardlink.oldpath);
putname(req->hardlink.newpath);
break;
+ case IORING_OP_STATX:
+ if (req->statx.filename)
+ putname(req->statx.filename);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6716,11 +7088,6 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->apoll);
req->apoll = NULL;
}
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
if (req->flags & REQ_F_ASYNC_DATA) {
@@ -6730,11 +7097,31 @@ static void io_clean_op(struct io_kiocb *req)
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
+{
+ if (req->file || !io_op_defs[req->opcode].needs_file)
+ return true;
+
+ if (req->flags & REQ_F_FIXED_FILE)
+ req->file = io_file_get_fixed(req, req->fd, issue_flags);
+ else
+ req->file = io_file_get_normal(req, req->fd);
+ if (req->file)
+ return true;
+
+ req_set_fail(req);
+ req->result = -EBADF;
+ return false;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct cred *creds = NULL;
int ret;
+ if (unlikely(!io_assign_file(req, issue_flags)))
+ return -EBADF;
+
if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
@@ -6851,6 +7238,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_MSG_RING:
+ ret = io_msg_ring(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -6881,10 +7271,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
unsigned int issue_flags = IO_URING_F_UNLOCKED;
bool needs_poll = false;
struct io_kiocb *timeout;
- int ret = 0;
+ int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
@@ -6896,14 +7287,20 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
- io_req_task_queue_fail(req, -ECANCELED);
+fail:
+ io_req_task_queue_fail(req, err);
return;
}
+ if (!io_assign_file(req, issue_flags)) {
+ err = -EBADF;
+ work->flags |= IO_WQ_WORK_CANCEL;
+ goto fail;
+ }
if (req->flags & REQ_F_FORCE_ASYNC) {
- const struct io_op_def *def = &io_op_defs[req->opcode];
bool opcode_poll = def->pollin || def->pollout;
if (opcode_poll && file_can_poll(req->file)) {
@@ -6926,7 +7323,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
continue;
}
- if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
return;
/* aborted or ready, in either case retry blocking */
needs_poll = false;
@@ -6960,46 +7357,56 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
file_slot->file_ptr = file_ptr;
}
-static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned int issue_flags)
{
- struct file *file;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = NULL;
unsigned long file_ptr;
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
+ goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
- io_req_set_rsrc_node(req, ctx);
+ io_req_set_rsrc_node(req, ctx, 0);
+out:
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
return file;
}
-static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+/*
+ * Drop the file for requeue operations. Only used if req->file is the
+ * io_uring descriptor itself.
+ */
+static void io_drop_inflight_file(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_INFLIGHT)) {
+ fput(req->file);
+ req->file = NULL;
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
+}
+
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
- trace_io_uring_file_get(ctx, fd);
+ trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
/* we don't allow fixed io_uring files */
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
+ if (file && file->f_op == &io_uring_fops)
+ req->flags |= REQ_F_INFLIGHT;
return file;
}
-static inline struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed)
-{
- if (fixed)
- return io_file_get_fixed(ctx, req, fd);
- else
- return io_file_get_normal(ctx, req, fd);
-}
-
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
struct io_kiocb *prev = req->timeout.prev;
@@ -7072,7 +7479,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
{
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
- switch (io_arm_poll_handler(req)) {
+ switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_req_task_queue(req);
break;
@@ -7083,6 +7490,8 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
*/
io_queue_async_work(req, NULL);
break;
+ case IO_APOLL_OK:
+ break;
}
if (linked_timeout)
@@ -7237,6 +7646,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (io_op_defs[opcode].needs_file) {
struct io_submit_state *state = &ctx->submit_state;
+ req->fd = READ_ONCE(sqe->fd);
+
/*
* Plug now if we have more than 2 IO left after this, and the
* target is potentially a read/write to block based storage.
@@ -7246,11 +7657,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
state->need_plug = false;
blk_start_plug_nr_ios(&state->plug, state->submit_nr);
}
-
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- return -EBADF;
}
personality = READ_ONCE(sqe->personality);
@@ -7281,7 +7687,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ret);
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
/* fail even hard links since we don't submit */
if (link->head) {
@@ -7308,7 +7714,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7451,8 +7857,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
/* will complete beyond this point, count as submitted */
submitted++;
- if (io_submit_sqe(ctx, req, sqe))
- break;
+ if (io_submit_sqe(ctx, req, sqe)) {
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was setup with IORING_SETUP_SUBMIT_ALL
+ */
+ if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+ break;
+ }
} while (submitted < nr);
if (unlikely(submitted != nr)) {
@@ -7602,7 +8014,7 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !current->task_works) {
+ if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -7613,6 +8025,13 @@ static int io_sq_thread(void *data)
needs_sched = false;
break;
}
+
+ /*
+ * Ensure the store of the wakeup flag is not
+ * reordered with the load of the SQ tail
+ */
+ smp_mb();
+
if (io_sqring_entries(ctx)) {
needs_sched = false;
break;
@@ -7684,11 +8103,11 @@ static int io_run_task_work_sig(void)
{
if (io_run_task_work())
return 1;
- if (!signal_pending(current))
- return 0;
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
- return -EINTR;
+ if (task_sigpending(current))
+ return -EINTR;
+ return 0;
}
/* when returns >0, the caller should retry */
@@ -7732,14 +8151,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
- }
-
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7753,6 +8164,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -8229,10 +8648,15 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr_files; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr; i++) {
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (file)
+ fput(file);
+ }
} else {
kfree_skb(skb);
+ free_uid(fpl->user);
kfree(fpl);
}
@@ -8520,13 +8944,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
+ u64 *tag_slot = io_get_tag_slot(data, idx);
struct io_rsrc_put *prsrc;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
return -ENOMEM;
- prsrc->tag = *io_get_tag_slot(data, idx);
+ prsrc->tag = *tag_slot;
+ *tag_slot = 0;
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
@@ -8595,7 +9021,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
- int ret, i;
+ int ret;
io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
@@ -8608,8 +9034,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
goto out;
- i = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ offset = array_index_nospec(offset, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, offset);
ret = -EBADF;
if (!file_slot->file_ptr)
goto out;
@@ -8665,8 +9091,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (file_slot->file_ptr) {
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
+ err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
if (err)
break;
file_slot->file_ptr = 0;
@@ -8691,7 +9116,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, up->offset + done) = tag;
+ *io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
@@ -8749,8 +9174,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8759,6 +9192,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8766,7 +9200,6 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
- atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -8783,6 +9216,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
@@ -9340,7 +9774,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
i = array_index_nospec(offset, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, offset,
+ err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->rsrc_node, ctx->user_bufs[i]);
if (unlikely(err)) {
io_buffer_unmap(ctx, &imu);
@@ -9359,33 +9793,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
- ctx->cq_ev_fd = NULL;
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
return ret;
}
-
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
@@ -9394,11 +9850,28 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- unsigned long index;
+ int i;
- xa_for_each(&ctx->io_buffers, index, buf)
- __io_remove_buffers(ctx, buf, index, -1U);
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+ struct list_head *list = &ctx->io_buffers[i];
+
+ while (!list_empty(list)) {
+ struct io_buffer_list *bl;
+
+ bl = list_first_entry(list, struct io_buffer_list, list);
+ __io_remove_buffers(ctx, bl, -1U);
+ list_del(&bl->list);
+ kfree(bl);
+ }
+ }
+
+ while (!list_empty(&ctx->io_buffers_pages)) {
+ struct page *page;
+
+ page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
@@ -9429,6 +9902,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+ struct async_poll *apoll;
+
+ while (!list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del(&apoll->poll.wait.entry);
+ kfree(apoll);
+ }
+}
+
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9450,8 +9935,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
- mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
+ io_flush_apoll_cache(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -9485,6 +9971,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
+ kfree(ctx->io_buffers);
kfree(ctx);
}
@@ -9904,7 +10391,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
if (tracked)
- return atomic_read(&tctx->inflight_tracked);
+ return 0;
return percpu_counter_sum(&tctx->inflight);
}
@@ -9983,6 +10470,144 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ for (offset = start; offset < end; offset++) {
+ offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[offset])
+ continue;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (file->f_op != &io_uring_fops) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ tctx->registered_rings[offset] = file;
+ return offset;
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ int ret, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ int start, end;
+
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (reg.resv) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (reg.offset == -1U) {
+ start = 0;
+ end = IO_RINGFD_REG_MAX;
+ } else {
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+ start = reg.offset;
+ end = start + 1;
+ }
+
+ ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+ if (ret < 0)
+ break;
+
+ reg.offset = ret;
+ if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret = 0, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ }
+
+ return i ? i : ret;
+}
+
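As a rough userspace sketch (not part of this patch) of the registration flow described in the comment above io_ringfd_register(): register the ring's own fd once, then pass the returned index, rather than the real fd, to io_uring_enter() with IORING_ENTER_REGISTERED_RING set. The register opcode and enter flag are assumed to come from the matching uapi change in this series; error handling is minimal.

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Register 'ring_fd' into the per-task table; returns the slot index to use
 * as 'fd' in io_uring_enter() when IORING_ENTER_REGISTERED_RING is set. */
static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update reg = {
		.offset	= -1U,		/* any free slot, kernel fills in the index */
		.data	= ring_fd,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RING_FDS, &reg, 1) != 1)
		return -1;
	return reg.offset;
}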
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -10095,6 +10720,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
+ if (arg.pad)
+ return -EINVAL;
*sig = u64_to_user_ptr(arg.sigmask);
*argsz = arg.sigmask_sz;
*ts = u64_to_user_ptr(arg.ts);
@@ -10113,12 +10740,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, so we
+ * need only dereference our task private array to find it.
+ */
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || fd >= IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10192,7 +10835,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
@@ -10334,7 +10978,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
hlist_for_each_entry(req, list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- req->task->task_works != NULL);
+ task_work_pending(req->task));
}
seq_puts(m, "CqOverflowList:\n");
@@ -10559,7 +11203,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;
+ IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
+ IORING_FEAT_LINKED_FILE;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -10610,7 +11255,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
return -EINVAL;
return io_uring_create(entries, &p, params);
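As a rough userspace sketch (not part of this patch) of opting in to the new IORING_SETUP_SUBMIT_ALL flag at ring creation, so a failed SQE no longer stops submission of the rest of the batch (see the io_submit_sqes() change earlier in this patch); the flag value is assumed to come from the matching uapi update.

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_ring_submit_all(unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SUBMIT_ALL;
	return syscall(__NR_io_uring_setup, entries, &p);	/* ring fd, or -1 on error */
}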
@@ -10770,8 +11415,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
__u32 tmp;
int err;
- if (up->resv)
- return -EINVAL;
if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW;
err = io_rsrc_node_switch_start(ctx);
@@ -10797,6 +11440,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
memset(&up, 0, sizeof(up));
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
return -EFAULT;
+ if (up.resv || up.resv2)
+ return -EINVAL;
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
@@ -10809,7 +11454,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
- if (!up.nr || up.resv)
+ if (!up.nr || up.resv || up.resv2)
return -EINVAL;
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
@@ -10857,7 +11502,15 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
if (len > cpumask_size())
len = cpumask_size();
- if (copy_from_user(new_mask, arg, len)) {
+ if (in_compat_syscall()) {
+ ret = compat_get_bitmap(cpumask_bits(new_mask),
+ (const compat_ulong_t __user *)arg,
+ len * 8 /* CHAR_BIT */);
+ } else {
+ ret = copy_from_user(new_mask, arg, len);
+ }
+
+ if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
@@ -10960,61 +11613,6 @@ err:
return ret;
}
-static bool io_register_op_must_quiesce(int op)
-{
- switch (op) {
- case IORING_REGISTER_BUFFERS:
- case IORING_UNREGISTER_BUFFERS:
- case IORING_REGISTER_FILES:
- case IORING_UNREGISTER_FILES:
- case IORING_REGISTER_FILES_UPDATE:
- case IORING_REGISTER_PROBE:
- case IORING_REGISTER_PERSONALITY:
- case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_FILES2:
- case IORING_REGISTER_FILES_UPDATE2:
- case IORING_REGISTER_BUFFERS2:
- case IORING_REGISTER_BUFFERS_UPDATE:
- case IORING_REGISTER_IOWQ_AFF:
- case IORING_UNREGISTER_IOWQ_AFF:
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- return false;
- default:
- return true;
- }
-}
-
-static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
-{
- long ret;
-
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab the
- * uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
- if (ret) {
- ret = min(0L, ret);
- break;
- }
-
- ret = io_run_task_work_sig();
- io_req_caches_free(ctx);
- } while (ret >= 0);
- mutex_lock(&ctx->uring_lock);
-
- if (ret)
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
- return ret;
-}
-
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -11038,12 +11636,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}
- if (io_register_op_must_quiesce(opcode)) {
- ret = io_ctx_quiesce(ctx);
- if (ret)
- return ret;
- }
-
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
@@ -11067,17 +11659,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
- case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
- ret = io_eventfd_register(ctx, arg);
- if (ret)
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
break;
- if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
- ctx->eventfd_async = 1;
- else
- ctx->eventfd_async = 0;
+ ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -11144,16 +11735,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- if (io_register_op_must_quiesce(opcode)) {
- /* bring the ctx back to life */
- percpu_ref_reinit(&ctx->refs);
- reinit_completion(&ctx->ref_comp);
- }
return ret;
}
@@ -11179,8 +11771,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 090bf47606ab..80ac36aea913 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -173,7 +173,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (*len == 0)
return -EINVAL;
- if (start > maxbytes)
+ if (start >= maxbytes)
return -EFBIG;
/*
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6c51a75d0be6..8ce8720093b9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -292,19 +292,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
* what do_mpage_readpage does.
*/
- if (!ctx->bio)
- ctx->bio = bio_alloc(orig_gfp, 1);
- ctx->bio->bi_opf = REQ_OP_READ;
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
- bio_set_dev(ctx->bio, iomap->bdev);
ctx->bio->bi_end_io = iomap_read_end_io;
bio_add_folio(ctx->bio, folio, plen, poff);
}
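Throughout these hunks the conversion follows the updated bio allocation API: bio_alloc(), bio_init() and bio_alloc_bioset() now take the target block device and the request op/flags at allocation time, replacing the separate bio_set_dev() call and bi_opf assignment. A hedged before/after fragment (generic names, not tied to iomap):

	/* older style: allocate, then attach device and op separately */
	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;

	/* current style: device, max vectors, op flags and gfp in one call */
	bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = read_end_io;	/* hypothetical completion handler */
	submit_bio(bio);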
@@ -424,37 +425,32 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
EXPORT_SYMBOL_GPL(iomap_readahead);
/*
- * iomap_is_partially_uptodate checks whether blocks within a page are
+ * iomap_is_partially_uptodate checks whether blocks within a folio are
* uptodate or not.
*
- * Returns true if all blocks which correspond to a file portion
- * we want to read within the page are uptodate.
+ * Returns true if all blocks which correspond to the specified part
+ * of the folio are uptodate.
*/
-int
-iomap_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
- struct folio *folio = page_folio(page);
struct iomap_page *iop = to_iomap_page(folio);
- struct inode *inode = page->mapping->host;
- unsigned len, first, last;
- unsigned i;
+ struct inode *inode = folio->mapping->host;
+ unsigned first, last, i;
- /* Limit range to one page */
- len = min_t(unsigned, PAGE_SIZE - from, count);
+ if (!iop)
+ return false;
- /* First and last blocks in range within page */
- first = from >> inode->i_blkbits;
- last = (from + len - 1) >> inode->i_blkbits;
+ /* Caller's range may extend past the end of this folio */
+ count = min(folio_size(folio) - from, count);
- if (iop) {
- for (i = first; i <= last; i++)
- if (!test_bit(i, iop->uptodate))
- return 0;
- return 1;
- }
+ /* First and last blocks in range within folio */
+ first = from >> inode->i_blkbits;
+ last = (from + count - 1) >> inode->i_blkbits;
- return 0;
+ for (i = first; i <= last; i++)
+ if (!test_bit(i, iop->uptodate))
+ return false;
+ return true;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
@@ -480,7 +476,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
- trace_iomap_invalidatepage(folio->mapping->host, offset, len);
+ trace_iomap_invalidate_folio(folio->mapping->host,
+ folio_pos(folio) + offset, len);
/*
* If we're invalidating the entire folio, clear the dirty state
@@ -499,13 +496,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-void iomap_invalidatepage(struct page *page, unsigned int offset,
- unsigned int len)
-{
- iomap_invalidate_folio(page_folio(page), offset, len);
-}
-EXPORT_SYMBOL_GPL(iomap_invalidatepage);
-
#ifdef CONFIG_MIGRATION
int
iomap_migrate_page(struct address_space *mapping, struct page *newpage,
@@ -550,10 +540,8 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
struct bio_vec bvec;
struct bio bio;
- bio_init(&bio, &bvec, 1);
- bio.bi_opf = REQ_OP_READ;
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_set_dev(&bio, iomap->bdev);
bio_add_folio(&bio, folio, plen, poff);
return submit_bio_wait(&bio);
}
@@ -776,7 +764,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
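The comparison change above relies on fault_in_iov_iter_readable() returning the number of bytes that could not be faulted in, not a boolean: bailing out with -EFAULT on any nonzero return gives up even when part of the buffer is resident, so the write path now fails only when the entire range faulted. A small sketch of the return-value semantics (variable names are illustrative, not the iomap code):

	size_t not_faulted = fault_in_iov_iter_readable(iter, bytes);

	if (not_faulted == bytes)
		return -EFAULT;		/* nothing in the range is accessible */

	/* otherwise at least (bytes - not_faulted) bytes are resident,
	 * so the copy loop can still make forward progress */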
@@ -1229,11 +1217,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
struct iomap_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
- bio_set_dev(bio, wpc->iomap.bdev);
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
@@ -1261,11 +1248,9 @@ iomap_chain_bio(struct bio *prev)
{
struct bio *new;
- new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
bio_get(prev); /* for iomap_finish_ioend */
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 03ea367df19a..b08f5dc31780 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
@@ -179,19 +180,20 @@ static void iomap_dio_bio_end_io(struct bio *bio)
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, iter->iomap.bdev);
+ bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
@@ -309,14 +311,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(GFP_KERNEL, nr_pages);
- bio_set_dev(bio, iomap->bdev);
+ bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
- bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 66cf267c68ae..610ca6f1ec9b 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/pagemap.h>
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
const struct iomap *iomap, u32 flags)
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 65e39785c284..a6689a563c6e 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -81,7 +81,7 @@ DEFINE_EVENT(iomap_range_class, name, \
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_releasepage);
-DEFINE_RANGE_EVENT(iomap_invalidatepage);
+DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
#define IOMAP_TYPE_STRINGS \
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 0c6eacfcbeef..d7491692aea3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep;
static struct inode *isofs_alloc_inode(struct super_block *sb)
{
struct iso_inode_info *ei;
- ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5b9408e3b370..ac7f067b7bdd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_wait_updates(journal);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c2cf74b01ddb..fcacafa4510d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
-EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8e2f8275a253..fcb9175016a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -107,7 +107,6 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
- spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
jbd2_descriptor_blocks_per_trans(journal) +
@@ -139,26 +138,22 @@ static void jbd2_get_transaction(journal_t *journal,
/*
* Update transaction's maximum wait time, if debugging is enabled.
*
- * In order for t_max_wait to be reliable, it must be protected by a
- * lock. But doing so will mean that start_this_handle() can not be
- * run in parallel on SMP systems, which limits our scalability. So
- * unless debugging is enabled, we no longer update t_max_wait, which
- * means that maximum wait time reported by the jbd2_run_stats
- * tracepoint will always be zero.
+ * t_max_wait is carefully updated here with the use of an atomic compare exchange.
+ * Note that there could be multiple threads trying to do this simultaneously,
+ * hence using cmpxchg to avoid any use of locks in this case.
+ * With this, t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
{
-#ifdef CONFIG_JBD2_DEBUG
- if (jbd2_journal_enable_debug &&
- time_after(transaction->t_start, ts)) {
- ts = jbd2_time_diff(ts, transaction->t_start);
- spin_lock(&transaction->t_handle_lock);
- if (ts > transaction->t_max_wait)
- transaction->t_max_wait = ts;
- spin_unlock(&transaction->t_handle_lock);
+ unsigned long oldts, newts;
+
+ if (time_after(transaction->t_start, ts)) {
+ newts = jbd2_time_diff(ts, transaction->t_start);
+ oldts = READ_ONCE(transaction->t_max_wait);
+ while (oldts < newts)
+ oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
}
-#endif
}
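The rewritten update_t_max_wait() is the standard lock-free "monotonic maximum" idiom: re-read the current maximum and retry cmpxchg() while the observed value is still smaller, so the largest sample wins without taking t_handle_lock. A minimal userspace sketch of the same idiom using C11 atomics (illustration only, not the kernel helpers):

#include <stdatomic.h>

/* Publish "sample" into *max if it exceeds the current maximum.
 * Concurrent callers may race; the largest value always wins. */
static void update_max(_Atomic unsigned long *max, unsigned long sample)
{
	unsigned long cur = atomic_load(max);

	while (cur < sample &&
	       !atomic_compare_exchange_weak(max, &cur, sample))
		;	/* cur was refreshed by the failed CAS; retry */
}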
/*
@@ -690,7 +685,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
DIV_ROUND_UP(
handle->h_revoke_credits_requested,
journal->j_revoke_records_per_block);
- spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -698,7 +692,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
+ goto error_out;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
@@ -714,8 +708,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
- spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
return result;
@@ -842,27 +834,35 @@ EXPORT_SYMBOL(jbd2_journal_restart);
*/
void jbd2_journal_wait_updates(journal_t *journal)
{
- transaction_t *commit_transaction = journal->j_running_transaction;
+ DEFINE_WAIT(wait);
- if (!commit_transaction)
- return;
+ while (1) {
+ /*
+ * Note that the running transaction can get freed under us if
+ * this transaction is getting committed in
+ * jbd2_journal_commit_transaction() ->
+ * jbd2_journal_free_transaction(). This can only happen when we
+ * release j_state_lock -> schedule() -> acquire j_state_lock.
+ * Hence we should retrieve a new j_running_transaction value every
+ * time (after the j_state_lock release/acquire cycle), else it may
+ * lead to a use-after-free of the old, freed transaction.
+ */
+ transaction_t *transaction = journal->j_running_transaction;
- spin_lock(&commit_transaction->t_handle_lock);
- while (atomic_read(&commit_transaction->t_updates)) {
- DEFINE_WAIT(wait);
+ if (!transaction)
+ break;
prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&commit_transaction->t_updates)) {
- spin_unlock(&commit_transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- write_lock(&journal->j_state_lock);
- spin_lock(&commit_transaction->t_handle_lock);
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&transaction->t_updates)) {
+ finish_wait(&journal->j_wait_updates, &wait);
+ break;
}
+ write_unlock(&journal->j_state_lock);
+ schedule();
finish_wait(&journal->j_wait_updates, &wait);
+ write_lock(&journal->j_state_lock);
}
- spin_unlock(&commit_transaction->t_handle_lock);
}
/**
@@ -877,8 +877,6 @@ void jbd2_journal_wait_updates(journal_t *journal)
*/
void jbd2_journal_lock_updates(journal_t *journal)
{
- DEFINE_WAIT(wait);
-
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
@@ -2219,14 +2217,14 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
}
/*
- * jbd2_journal_invalidatepage
+ * jbd2_journal_invalidate_folio
*
* This code is tricky. It has a number of cases to deal with.
*
* There are two invariants which this code relies on:
*
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
+ * i_size must be updated on disk before we start calling invalidate_folio
+ * on the data.
*
* This is done in ext3 by defining an ext3_setattr method which
* updates i_size before truncate gets going. By maintaining this
@@ -2428,9 +2426,9 @@ zap_buffer_unlocked:
}
/**
- * jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidate_folio()
* @journal: journal to use for flush...
- * @page: page to flush
+ * @folio: folio to flush
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
@@ -2439,30 +2437,29 @@ zap_buffer_unlocked:
* the page is straddling i_size. Caller then has to wait for current commit
* and try again.
*/
-int jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned int offset,
- unsigned int length)
+int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
+ size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int may_free = 1;
int ret = 0;
- if (!PageLocked(page))
+ if (!folio_test_locked(folio))
BUG();
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return 0;
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
- head = bh = page_buffers(page);
+ bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
@@ -2485,8 +2482,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(page))
- J_ASSERT(!page_has_buffers(page));
+ if (may_free && try_to_free_buffers(&folio->page))
+ J_ASSERT(!folio_buffers(folio));
}
return 0;
}
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index b288c8ae1236..837cd55fd4c5 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
ret = -EIO;
- goto out_free;
+ goto out_sum_exit;
}
jffs2_calc_trigger_levels(c);
return 0;
+ out_sum_exit:
+ jffs2_sum_exit(c);
out_free:
kvfree(c->blocks);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2ac410477c4f..71f03a5d36ed 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -603,8 +603,8 @@ out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
- out_inohash:
jffs2_clear_xattr_subsystem(c);
+ out_inohash:
kfree(c->inocache_list);
out_wbuf:
jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 2e4a86763c07..93a2951538ce 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -18,11 +18,11 @@
#include <linux/mutex.h>
struct jffs2_inode_info {
- /* We need an internal mutex similar to inode->i_mutex.
+ /* We need an internal mutex similar to inode->i_rwsem.
Unfortunately, we can't use the existing one, because
either the GC would deadlock, or we'd have to release it
before letting GC proceed. Or we'd have to put ugliness
- into the GC code so it didn't attempt to obtain the i_mutex
+ into the GC code so it didn't attempt to obtain the i_rwsem
for the inode(s) which are already locked */
struct mutex sem;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b676056826be..29671e33a171 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
if (!s) {
JFFS2_WARNING("Can't allocate memory for summary\n");
ret = -ENOMEM;
- goto out;
+ goto out_buf;
}
}
@@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
}
ret = 0;
out:
+ jffs2_sum_reset_collected(s);
+ kfree(s);
+ out_buf:
if (buf_size)
kfree(flashbuf);
#ifndef __ECOS
else
mtd_unpoint(c->mtd, 0, c->mtd->size);
#endif
- kfree(s);
return ret;
}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 81ca58c10b72..7ea37f49f1e1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
{
struct jffs2_inode_info *f;
- f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
+ f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL);
if (!f)
return NULL;
return &f->vfs_inode;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 57ab424c05ff..d1943a7b4b04 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode)
dquot_initialize(inode);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+ struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
- if (JFS_SBI(inode->i_sb)->ipimap)
+ if (ipimap && JFS_IP(ipimap)->i_imap)
diFree(inode);
/*
@@ -357,7 +358,8 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations jfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = jfs_readpage,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 91f4ec93dab1..d8502f4989d9 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -148,6 +148,7 @@ static const s8 budtab[256] = {
* 0 - success
* -ENOMEM - insufficient memory
* -EIO - i/o error
+ * -EINVAL - wrong bmap data
*/
int dbMount(struct inode *ipbmap)
{
@@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap)
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+ if (!bmp->db_numag) {
+ release_metapage(mp);
+ kfree(bmp);
+ return -EINVAL;
+ }
+
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 78fd136ac13b..997c81fcea34 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(GFP_NOFS, 1);
-
+ bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_READ;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 104ae698443e..c4220ccdedef 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
}
len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS);
bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
/* Don't call bio_add_page yet, we may add to this vec */
bio_offset = offset;
@@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page)
if (bio)
submit_bio(bio);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ,
+ GFP_NOFS);
bio->bi_iter.bi_sector =
pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
len = xlen << inode->i_blkbits;
offset = block_offset << inode->i_blkbits;
if (bio_add_page(bio, page, len, offset) < len)
@@ -555,22 +552,22 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void metapage_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- BUG_ON(offset || length < PAGE_SIZE);
+ BUG_ON(offset || length < folio_size(folio));
- BUG_ON(PageWriteback(page));
+ BUG_ON(folio_test_writeback(folio));
- metapage_releasepage(page, 0);
+ metapage_releasepage(&folio->page, 0);
}
const struct address_space_operations jfs_metapage_aops = {
.readpage = metapage_readpage,
.writepage = metapage_writepage,
.releasepage = metapage_releasepage,
- .invalidatepage = metapage_invalidatepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .invalidate_folio = metapage_invalidate_folio,
+ .dirty_folio = filemap_dirty_folio,
};
struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 24cbc9946e01..f1a13a74cddf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
{
struct jfs_inode_info *jfs_inode;
- jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
+ jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS);
if (!jfs_inode)
return NULL;
#ifdef CONFIG_QUOTA
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..e205fde7163a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -971,6 +971,15 @@ void kernfs_destroy_root(struct kernfs_root *root)
}
/**
+ * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
+ * @root: root to use to lookup
+ */
+struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
+{
+ return root->kn;
+}
+
+/**
* kernfs_create_dir_ns - create a directory
* @parent: parent in which to create a new directory
* @name: name of the new directory
@@ -1397,7 +1406,12 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return;
+
+ root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9414a7a60a9f..88423069407c 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -120,13 +120,8 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
if (next == ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, next);
return next;
- } else {
- /*
- * The same behavior and code as single_open(). Returns
- * !NULL if pos is at the beginning; otherwise, NULL.
- */
- return NULL + !*ppos;
}
+ return single_start(sf, ppos);
}
static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
@@ -1002,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
#endif
/*
- * kn->attr.ops is accesible only while holding active ref. We
+ * kn->attr.ops is accessible only while holding active ref. We
* need to know whether some ops are implemented outside active
* ref. Cache their existence in flags.
*/
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index f9cc912c31e1..eeaa779b929c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -31,6 +31,24 @@ struct kernfs_iattrs {
atomic_t user_xattr_size;
};
+struct kernfs_root {
+ /* published fields */
+ struct kernfs_node *kn;
+ unsigned int flags; /* KERNFS_ROOT_* flags */
+
+ /* private fields, do not use outside kernfs proper */
+ struct idr ino_idr;
+ u32 last_id_lowbits;
+ u32 id_highbits;
+ struct kernfs_syscall_ops *syscall_ops;
+
+ /* list of kernfs_super_info of this root, protected by kernfs_rwsem */
+ struct list_head supers;
+
+ wait_queue_head_t deactivate_waitq;
+ struct rw_semaphore kernfs_rwsem;
+};
+
/* +1 to avoid triggering overflow warning when negating it */
#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
@@ -122,7 +140,6 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/*
* dir.c
*/
-extern struct rw_semaphore kernfs_rwsem;
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 71bfb7de4472..ebe6ca08467a 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -241,7 +241,7 @@ struct ksmbd_rpc_command {
struct ksmbd_spnego_authen_request {
__u32 handle;
__u16 spnego_blob_len; /* the length of spnego_blob */
- __u8 spnego_blob[0]; /*
+ __u8 spnego_blob[]; /*
* the GSS token from SecurityBuffer of
* SMB2 SESSION SETUP request
*/
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 60e7ac62c917..1e2076a53bed 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -158,19 +158,41 @@ out:
* Return : windows path string or error
*/
-char *convert_to_nt_pathname(char *filename)
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path)
{
- char *ab_pathname;
+ char *pathname, *ab_pathname, *nt_pathname;
+ int share_path_len = share->path_sz;
- if (strlen(filename) == 0)
- filename = "\\";
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return ERR_PTR(-EACCES);
- ab_pathname = kstrdup(filename, GFP_KERNEL);
- if (!ab_pathname)
- return NULL;
+ ab_pathname = d_path(path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ if (strncmp(ab_pathname, share->path, share_path_len)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL);
+ if (!nt_pathname) {
+ nt_pathname = ERR_PTR(-ENOMEM);
+ goto free_pathname;
+ }
+ if (ab_pathname[share_path_len] == '\0')
+ strcpy(nt_pathname, "/");
+ strcat(nt_pathname, &ab_pathname[share_path_len]);
+
+ ksmbd_conv_path_to_windows(nt_pathname);
- ksmbd_conv_path_to_windows(ab_pathname);
- return ab_pathname;
+free_pathname:
+ kfree(pathname);
+ return nt_pathname;
}
int get_nlink(struct kstat *st)
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index 253366bd0951..aae2a252945f 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -14,7 +14,8 @@ struct ksmbd_file;
int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
-char *convert_to_nt_pathname(char *filename);
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h
index adaf4c0cbe8f..f13153c18b4e 100644
--- a/fs/ksmbd/ntlmssp.h
+++ b/fs/ksmbd/ntlmssp.h
@@ -95,7 +95,7 @@ struct security_buffer {
struct target_info {
__le16 Type;
__le16 Length;
- __u8 Content[0];
+ __u8 Content[];
} __packed;
struct negotiate_message {
@@ -108,7 +108,7 @@ struct negotiate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __packed;
@@ -140,7 +140,7 @@ struct authenticate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char UserString[0];
+ char UserString[];
} __packed;
struct ntlmv2_resp {
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 077b8761d099..8b5560574d4c 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -656,8 +656,8 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE;
rsp->Reserved = 0;
rsp->Reserved2 = 0;
- rsp->PersistentFid = cpu_to_le64(fp->persistent_id);
- rsp->VolatileFid = cpu_to_le64(fp->volatile_id);
+ rsp->PersistentFid = fp->persistent_id;
+ rsp->VolatileFid = fp->volatile_id;
inc_rfc1001_len(work->response_buf, 24);
@@ -1694,33 +1694,3 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
-
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name)
-{
- struct oplock_info *opinfo = opinfo_get(fp);
- int ret = 0;
-
- if (opinfo && opinfo->is_lease) {
- if (!lctx) {
- pr_err("open does not include lease\n");
- ret = -EBADF;
- goto out;
- }
- if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- pr_err("invalid lease key\n");
- ret = -EBADF;
- goto out;
- }
- if (name && strcmp(fp->filename, name)) {
- pr_err("invalid name reconnect %s\n", name);
- ret = -EINVAL;
- goto out;
- }
- }
-out:
- if (opinfo)
- opinfo_put(opinfo);
- return ret;
-}
diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h
index 0cf7a2b5bbc0..09753448f779 100644
--- a/fs/ksmbd/oplock.h
+++ b/fs/ksmbd/oplock.h
@@ -124,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 2e12f6d8483b..4cd03d661df0 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -585,7 +585,7 @@ static int __init ksmbd_server_init(void)
if (ret)
goto err_crypto_destroy;
- pr_warn_once("The ksmbd server is experimental, use at your own risk.\n");
+ pr_warn_once("The ksmbd server is experimental\n");
return 0;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 67e8e28e3fc3..16c803a9d996 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -11,6 +11,7 @@
#include <linux/statfs.h>
#include <linux/ethtool.h>
#include <linux/falloc.h>
+#include <linux/mount.h>
#include "glob.h"
#include "smbfsctl.h"
@@ -377,12 +378,8 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
* command in the compound request
*/
if (req->Command == SMB2_CREATE && rsp->Status == STATUS_SUCCESS) {
- work->compound_fid =
- le64_to_cpu(((struct smb2_create_rsp *)rsp)->
- VolatileFileId);
- work->compound_pfid =
- le64_to_cpu(((struct smb2_create_rsp *)rsp)->
- PersistentFileId);
+ work->compound_fid = ((struct smb2_create_rsp *)rsp)->VolatileFileId;
+ work->compound_pfid = ((struct smb2_create_rsp *)rsp)->PersistentFileId;
work->compound_sid = le64_to_cpu(rsp->SessionId);
}
@@ -2129,7 +2126,7 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work)
rsp->EndofFile = cpu_to_le64(0);
rsp->FileAttributes = FILE_ATTRIBUTE_NORMAL_LE;
rsp->Reserved2 = 0;
- rsp->VolatileFileId = cpu_to_le64(id);
+ rsp->VolatileFileId = id;
rsp->PersistentFileId = 0;
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
@@ -2922,7 +2919,6 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
- fp->filename = name;
fp->cdoption = req->CreateDisposition;
fp->daccess = daccess;
fp->saccess = req->ShareAccess;
@@ -3157,8 +3153,8 @@ int smb2_open(struct ksmbd_work *work)
rsp->Reserved2 = 0;
- rsp->PersistentFileId = cpu_to_le64(fp->persistent_id);
- rsp->VolatileFileId = cpu_to_le64(fp->volatile_id);
+ rsp->PersistentFileId = fp->persistent_id;
+ rsp->VolatileFileId = fp->volatile_id;
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
@@ -3274,14 +3270,13 @@ err_out1:
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
- if (!fp || !fp->filename)
- kfree(name);
if (fp)
ksmbd_fd_put(work, fp);
smb2_set_err_rsp(work);
ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
}
+ kfree(name);
kfree(lc);
return 0;
@@ -3865,9 +3860,7 @@ int smb2_query_dir(struct ksmbd_work *work)
goto err_out2;
}
- dir_fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ dir_fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!dir_fp) {
rc = -EBADF;
goto err_out2;
@@ -3901,8 +3894,6 @@ int smb2_query_dir(struct ksmbd_work *work)
ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr);
}
- ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename);
-
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
@@ -4088,12 +4079,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
* Windows can sometime send query file info request on
* pipe without opening it, checking error condition here
*/
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (!ksmbd_session_rpc_method(sess, id))
return -ENOENT;
ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n",
- req->FileInfoClass, le64_to_cpu(req->VolatileFileId));
+ req->FileInfoClass, req->VolatileFileId);
switch (req->FileInfoClass) {
case FILE_STANDARD_INFORMATION:
@@ -4396,9 +4387,9 @@ static int get_file_all_info(struct ksmbd_work *work,
return -EACCES;
}
- filename = convert_to_nt_pathname(fp->filename);
- if (!filename)
- return -ENOMEM;
+ filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
inode = file_inode(fp->filp);
generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
@@ -4738,7 +4729,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
}
if (work->next_smb2_rcv_hdr_off) {
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -4747,8 +4738,8 @@ static int smb2_get_info_file(struct ksmbd_work *work,
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -5005,15 +4996,17 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
case FS_SECTOR_SIZE_INFORMATION:
{
struct smb3_fs_ss_info *info;
+ unsigned int sector_size =
+ min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096);
info = (struct smb3_fs_ss_info *)(rsp->Buffer);
- info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize);
+ info->LogicalBytesPerSector = cpu_to_le32(sector_size);
info->PhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
- info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
+ info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size);
info->FSEffPhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE |
SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE);
info->ByteOffsetForSectorAlignment = 0;
@@ -5113,7 +5106,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
}
if (work->next_smb2_rcv_hdr_off) {
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -5122,8 +5115,8 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -5221,7 +5214,7 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work)
struct smb2_close_req *req = smb2_get_msg(work->request_buf);
struct smb2_close_rsp *rsp = smb2_get_msg(work->response_buf);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
ksmbd_session_rpc_close(work->sess, id);
rsp->StructureSize = cpu_to_le16(60);
@@ -5280,7 +5273,7 @@ int smb2_close(struct ksmbd_work *work)
}
if (work->next_smb2_rcv_hdr_off &&
- !has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ !has_file_id(req->VolatileFileId)) {
if (!has_file_id(work->compound_fid)) {
/* file already closed, return FILE_CLOSED */
ksmbd_debug(SMB, "file already closed\n");
@@ -5299,7 +5292,7 @@ int smb2_close(struct ksmbd_work *work)
work->compound_pfid = KSMBD_NO_FID;
}
} else {
- volatile_id = le64_to_cpu(req->VolatileFileId);
+ volatile_id = req->VolatileFileId;
}
ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id);
@@ -5689,8 +5682,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
size = i_size_read(inode);
rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512);
if (rc) {
- pr_err("truncate failed! filename : %s, err %d\n",
- fp->filename, rc);
+ pr_err("truncate failed!, err %d\n", rc);
return rc;
}
if (size < alloc_blks * 512)
@@ -5720,12 +5712,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncated range.
*/
if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
- ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
- fp->filename, newsize);
+ ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
- ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
- fp->filename, rc);
+ ksmbd_debug(SMB, "truncate failed!, err %d\n", rc);
if (rc != -EAGAIN)
rc = -EBADF;
return rc;
@@ -5771,8 +5761,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
+ ksmbd_fd_put(work, parent_fp);
return -ESHARE;
}
+ ksmbd_fd_put(work, parent_fp);
}
next:
return smb2_rename(work, fp, user_ns, rename_info,
@@ -5988,7 +5980,7 @@ int smb2_set_info(struct ksmbd_work *work)
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
rsp = ksmbd_resp_buf_next(work);
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -6000,8 +5992,8 @@ int smb2_set_info(struct ksmbd_work *work)
}
if (!has_file_id(id)) {
- id = le64_to_cpu(req->VolatileFileId);
- pid = le64_to_cpu(req->PersistentFileId);
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
}
fp = ksmbd_lookup_fd_slow(work, id, pid);
@@ -6079,7 +6071,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
struct smb2_read_req *req = smb2_get_msg(work->request_buf);
struct smb2_read_rsp *rsp = smb2_get_msg(work->response_buf);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
inc_rfc1001_len(work->response_buf, 16);
rpc_resp = ksmbd_rpc_read(work->sess, id);
@@ -6215,8 +6207,7 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
- fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
err = -ENOENT;
goto out;
@@ -6335,7 +6326,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
size_t length;
length = le32_to_cpu(req->Length);
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (le16_to_cpu(req->DataOffset) ==
offsetof(struct smb2_write_req, Buffer)) {
@@ -6471,8 +6462,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
- fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
err = -ENOENT;
goto out;
@@ -6584,12 +6574,9 @@ int smb2_flush(struct ksmbd_work *work)
WORK_BUFFERS(work, req, rsp);
- ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n",
- le64_to_cpu(req->VolatileFileId));
+ ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", req->VolatileFileId);
- err = ksmbd_vfs_fsync(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ err = ksmbd_vfs_fsync(work, req->VolatileFileId, req->PersistentFileId);
if (err)
goto out;
@@ -6618,8 +6605,7 @@ int smb2_cancel(struct ksmbd_work *work)
struct ksmbd_conn *conn = work->conn;
struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
struct smb2_hdr *chdr;
- struct ksmbd_work *cancel_work = NULL;
- int canceled = 0;
+ struct ksmbd_work *cancel_work = NULL, *iter;
struct list_head *command_list;
ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n",
@@ -6629,11 +6615,11 @@ int smb2_cancel(struct ksmbd_work *work)
command_list = &conn->async_requests;
spin_lock(&conn->request_lock);
- list_for_each_entry(cancel_work, command_list,
+ list_for_each_entry(iter, command_list,
async_request_entry) {
- chdr = smb2_get_msg(cancel_work->request_buf);
+ chdr = smb2_get_msg(iter->request_buf);
- if (cancel_work->async_id !=
+ if (iter->async_id !=
le64_to_cpu(hdr->Id.AsyncId))
continue;
@@ -6641,7 +6627,7 @@ int smb2_cancel(struct ksmbd_work *work)
"smb2 with AsyncId %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->Id.AsyncId),
le16_to_cpu(chdr->Command));
- canceled = 1;
+ cancel_work = iter;
break;
}
spin_unlock(&conn->request_lock);
@@ -6649,24 +6635,24 @@ int smb2_cancel(struct ksmbd_work *work)
command_list = &conn->requests;
spin_lock(&conn->request_lock);
- list_for_each_entry(cancel_work, command_list, request_entry) {
- chdr = smb2_get_msg(cancel_work->request_buf);
+ list_for_each_entry(iter, command_list, request_entry) {
+ chdr = smb2_get_msg(iter->request_buf);
if (chdr->MessageId != hdr->MessageId ||
- cancel_work == work)
+ iter == work)
continue;
ksmbd_debug(SMB,
"smb2 with mid %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->MessageId),
le16_to_cpu(chdr->Command));
- canceled = 1;
+ cancel_work = iter;
break;
}
spin_unlock(&conn->request_lock);
}
- if (canceled) {
+ if (cancel_work) {
cancel_work->state = KSMBD_WORK_CANCELLED;
if (cancel_work->cancel_fn)
cancel_work->cancel_fn(cancel_work->cancel_argv);
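The smb2_cancel() change also fixes a list-iterator pattern: after list_for_each_entry() the loop variable no longer points at a valid element (it is a container_of() of the list head), so testing a separate "canceled" flag and then dereferencing the iterator is fragile. The safer pattern used above remembers the match in its own pointer; a generic sketch with hypothetical types, not ksmbd code:

#include <linux/list.h>
#include <linux/types.h>

struct pending {
	struct list_head entry;
	u64 id;
};

static struct pending *find_pending(struct list_head *head, u64 id)
{
	struct pending *iter, *found = NULL;

	list_for_each_entry(iter, head, entry) {
		if (iter->id == id) {
			found = iter;		/* remember the match */
			break;
		}
	}
	/* only "found" may be dereferenced after the loop */
	return found;
}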
@@ -6804,12 +6790,9 @@ int smb2_lock(struct ksmbd_work *work)
int prior_lock = 0;
ksmbd_debug(SMB, "Received lock request\n");
- fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp) {
- ksmbd_debug(SMB, "Invalid file id for lock : %llu\n",
- le64_to_cpu(req->VolatileFileId));
+ ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", req->VolatileFileId);
err = -ENOENT;
goto out2;
}
@@ -7164,8 +7147,8 @@ static int fsctl_copychunk(struct ksmbd_work *work,
ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
- rsp->VolatileFileId = cpu_to_le64(volatile_id);
- rsp->PersistentFileId = cpu_to_le64(persistent_id);
+ rsp->VolatileFileId = volatile_id;
+ rsp->PersistentFileId = persistent_id;
ci_rsp->ChunksWritten =
cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
ci_rsp->ChunkBytesWritten =
@@ -7379,8 +7362,8 @@ ipv6_retry:
if (nii_rsp)
nii_rsp->Next = 0;
- rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
- rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->PersistentFileId = SMB2_NO_FID;
+ rsp->VolatileFileId = SMB2_NO_FID;
return nbytes;
}
@@ -7547,9 +7530,7 @@ static int fsctl_request_resume_key(struct ksmbd_work *work,
{
struct ksmbd_file *fp;
- fp = ksmbd_lookup_fd_slow(work,
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId));
+ fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
if (!fp)
return -ENOENT;
@@ -7579,7 +7560,7 @@ int smb2_ioctl(struct ksmbd_work *work)
if (work->next_smb2_rcv_hdr_off) {
req = ksmbd_req_buf_next(work);
rsp = ksmbd_resp_buf_next(work);
- if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(req->VolatileFileId)) {
ksmbd_debug(SMB, "Compound request set FID = %llu\n",
work->compound_fid);
id = work->compound_fid;
@@ -7590,14 +7571,14 @@ int smb2_ioctl(struct ksmbd_work *work)
}
if (!has_file_id(id))
- id = le64_to_cpu(req->VolatileFileId);
+ id = req->VolatileFileId;
if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) {
rsp->hdr.Status = STATUS_NOT_SUPPORTED;
goto out;
}
- cnt_code = le32_to_cpu(req->CntCode);
+ cnt_code = le32_to_cpu(req->CtlCode);
ret = smb2_calc_max_out_buf_len(work, 48,
le32_to_cpu(req->MaxOutputResponse));
if (ret < 0) {
@@ -7656,8 +7637,8 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
nbytes = sizeof(struct validate_negotiate_info_rsp);
- rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
- rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->PersistentFileId = SMB2_NO_FID;
+ rsp->VolatileFileId = SMB2_NO_FID;
break;
case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len);
@@ -7703,10 +7684,10 @@ int smb2_ioctl(struct ksmbd_work *work)
rsp->PersistentFileId = req->PersistentFileId;
fsctl_copychunk(work,
(struct copychunk_ioctl_req *)&req->Buffer[0],
- le32_to_cpu(req->CntCode),
+ le32_to_cpu(req->CtlCode),
le32_to_cpu(req->InputCount),
- le64_to_cpu(req->VolatileFileId),
- le64_to_cpu(req->PersistentFileId),
+ req->VolatileFileId,
+ req->PersistentFileId,
rsp);
break;
case FSCTL_SET_SPARSE:
@@ -7857,7 +7838,7 @@ dup_ext_out:
goto out;
}
- rsp->CntCode = cpu_to_le32(cnt_code);
+ rsp->CtlCode = cpu_to_le32(cnt_code);
rsp->InputCount = cpu_to_le32(0);
rsp->InputOffset = cpu_to_le32(112);
rsp->OutputOffset = cpu_to_le32(112);
@@ -7903,8 +7884,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
char req_oplevel = 0, rsp_oplevel = 0;
unsigned int oplock_change_type;
- volatile_id = le64_to_cpu(req->VolatileFid);
- persistent_id = le64_to_cpu(req->PersistentFid);
+ volatile_id = req->VolatileFid;
+ persistent_id = req->PersistentFid;
req_oplevel = req->OplockLevel;
ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n",
volatile_id, persistent_id, req_oplevel);
@@ -7999,8 +7980,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
rsp->Reserved2 = 0;
- rsp->VolatileFid = cpu_to_le64(volatile_id);
- rsp->PersistentFid = cpu_to_le64(persistent_id);
+ rsp->VolatileFid = volatile_id;
+ rsp->PersistentFid = persistent_id;
inc_rfc1001_len(work->response_buf, 24);
return;
@@ -8500,7 +8481,7 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type)
struct smb2_hdr *hdr = smb2_get_msg(old_buf);
unsigned int orig_len = get_rfc1002_len(old_buf);
- memset(tr_buf, 0, sizeof(struct smb2_transform_hdr) + 4);
+ /* tr_buf must be cleared by the caller */
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
tr_hdr->Flags = cpu_to_le16(TRANSFORM_FLAG_ENCRYPTED);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 725b800c29c8..af455278d005 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -16,42 +16,13 @@
#define FILE_CREATED 0x00000002
#define FILE_OVERWRITTEN 0x00000003
-/*
- * Size of the session key (crypto key encrypted with the password
- */
-#define SMB2_NTLMV2_SESSKEY_SIZE 16
-#define SMB2_SIGNATURE_SIZE 16
-#define SMB2_HMACSHA256_SIZE 32
-#define SMB2_CMACAES_SIZE 16
-#define SMB3_GCM128_CRYPTKEY_SIZE 16
-#define SMB3_GCM256_CRYPTKEY_SIZE 32
-
-/*
- * Size of the smb3 encryption/decryption keys
- */
-#define SMB3_ENC_DEC_KEY_SIZE 32
-
-/*
- * Size of the smb3 signing key
- */
-#define SMB3_SIGN_KEY_SIZE 16
-
-#define CIFS_CLIENT_CHALLENGE_SIZE 8
-#define SMB_SERVER_CHALLENGE_SIZE 8
-
/* SMB2 Max Credits */
#define SMB2_MAX_CREDITS 8192
-/* Maximum buffer size value we can send with 1 credit */
-#define SMB2_MAX_BUFFER_SIZE 65536
-
-#define NUMBER_OF_SMB2_COMMANDS 0x0013
-
/* BB FIXME - analyze following length BB */
#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
-#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
#define SMB3_MIN_IOSIZE (64 * 1024)
#define SMB3_MAX_IOSIZE (8 * 1024 * 1024)
@@ -65,18 +36,6 @@
*
*/
-#define SMB2_ERROR_STRUCTURE_SIZE2 9
-#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2)
-
-struct smb2_err_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize;
- __u8 ErrorContextCount;
- __u8 Reserved;
- __le32 ByteCount; /* even if zero, at least one byte follows */
- __u8 ErrorData[1]; /* variable length */
-} __packed;
-
struct preauth_integrity_info {
/* PreAuth integrity Hash ID */
__le16 Preauth_HashId;
@@ -116,8 +75,8 @@ struct create_durable_reconn_req {
union {
__u8 Reserved[16];
struct {
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
} Fid;
} Data;
} __packed;
@@ -126,8 +85,8 @@ struct create_durable_reconn_v2_req {
struct create_context ccontext;
__u8 Name[8];
struct {
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
} Fid;
__u8 CreateGuid[16];
__le32 Flags;
@@ -161,13 +120,6 @@ struct create_alloc_size_req {
__le64 AllocationSize;
} __packed;
-struct create_posix {
- struct create_context ccontext;
- __u8 Name[16];
- __le32 Mode;
- __u32 Reserved;
-} __packed;
-
struct create_durable_rsp {
struct create_context ccontext;
__u8 Name[8];
@@ -209,45 +161,6 @@ struct create_posix_rsp {
u8 SidBuffer[40];
} __packed;
-#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
-#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
-#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
-#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
-
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
-
-#define SMB2_LEASE_KEY_SIZE 16
-
-struct lease_context {
- __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
-} __packed;
-
-struct lease_context_v2 {
- __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
- __le32 LeaseState;
- __le32 LeaseFlags;
- __le64 LeaseDuration;
- __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE];
- __le16 Epoch;
- __le16 Reserved;
-} __packed;
-
-struct create_lease {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context lcontext;
-} __packed;
-
-struct create_lease_v2 {
- struct create_context ccontext;
- __u8 Name[8];
- struct lease_context_v2 lcontext;
- __u8 Pad[4];
-} __packed;
-
struct smb2_buffer_desc_v1 {
__le64 offset;
__le32 token;
@@ -256,63 +169,6 @@ struct smb2_buffer_desc_v1 {
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
-
-struct smb2_ioctl_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 57 */
- __le16 Reserved; /* offset from start of SMB2 header to write data */
- __le32 CntCode;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 InputOffset; /* Reserved MBZ */
- __le32 InputCount;
- __le32 MaxInputResponse;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 MaxOutputResponse;
- __le32 Flags;
- __le32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_ioctl_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 Reserved; /* offset from start of SMB2 header to write data */
- __le32 CntCode;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le32 InputOffset; /* Reserved MBZ */
- __le32 InputCount;
- __le32 OutputOffset;
- __le32 OutputCount;
- __le32 Flags;
- __le32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-struct validate_negotiate_info_req {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 DialectCount;
- __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
-} __packed;
-
-struct validate_negotiate_info_rsp {
- __le32 Capabilities;
- __u8 Guid[SMB2_CLIENT_GUID_SIZE];
- __le16 SecurityMode;
- __le16 Dialect; /* Dialect in use for the connection */
-} __packed;
-
struct smb_sockaddr_in {
__be16 Port;
__be32 IPv4address;
@@ -357,7 +213,7 @@ struct file_object_buf_type1_ioctl_rsp {
} __packed;
struct resume_key_ioctl_rsp {
- __le64 ResumeKey[3];
+ __u64 ResumeKey[3];
__le32 ContextLength;
__u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */
} __packed;
@@ -386,167 +242,6 @@ struct file_sparse {
__u8 SetSparse;
} __packed;
-struct file_zero_data_information {
- __le64 FileOffset;
- __le64 BeyondFinalZero;
-} __packed;
-
-struct file_allocated_range_buffer {
- __le64 file_offset;
- __le64 length;
-} __packed;
-
-struct reparse_data_buffer {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __u8 DataBuffer[]; /* Variable Length */
-} __packed;
-
-/* SMB2 Notify Action Flags */
-#define FILE_ACTION_ADDED 0x00000001
-#define FILE_ACTION_REMOVED 0x00000002
-#define FILE_ACTION_MODIFIED 0x00000003
-#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
-#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
-#define FILE_ACTION_ADDED_STREAM 0x00000006
-#define FILE_ACTION_REMOVED_STREAM 0x00000007
-#define FILE_ACTION_MODIFIED_STREAM 0x00000008
-#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
-
-#define SMB2_LOCKFLAG_SHARED 0x0001
-#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002
-#define SMB2_LOCKFLAG_UNLOCK 0x0004
-#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
-#define SMB2_LOCKFLAG_MASK 0x0007
-
-struct smb2_lock_element {
- __le64 Offset;
- __le64 Length;
- __le32 Flags;
- __le32 Reserved;
-} __packed;
-
-struct smb2_lock_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 48 */
- __le16 LockCount;
- __le32 Reserved;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- /* Followed by at least one */
- struct smb2_lock_element locks[1];
-} __packed;
-
-struct smb2_lock_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_echo_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-struct smb2_echo_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 4 */
- __u16 Reserved;
-} __packed;
-
-/* search (query_directory) Flags field */
-#define SMB2_RESTART_SCANS 0x01
-#define SMB2_RETURN_SINGLE_ENTRY 0x02
-#define SMB2_INDEX_SPECIFIED 0x04
-#define SMB2_REOPEN 0x10
-
-struct smb2_query_directory_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 FileInformationClass;
- __u8 Flags;
- __le32 FileIndex;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __le16 FileNameOffset;
- __le16 FileNameLength;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_directory_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Possible InfoType values */
-#define SMB2_O_INFO_FILE 0x01
-#define SMB2_O_INFO_FILESYSTEM 0x02
-#define SMB2_O_INFO_SECURITY 0x03
-#define SMB2_O_INFO_QUOTA 0x04
-
-/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
-#define OWNER_SECINFO 0x00000001
-#define GROUP_SECINFO 0x00000002
-#define DACL_SECINFO 0x00000004
-#define SACL_SECINFO 0x00000008
-#define LABEL_SECINFO 0x00000010
-#define ATTRIBUTE_SECINFO 0x00000020
-#define SCOPE_SECINFO 0x00000040
-#define BACKUP_SECINFO 0x00010000
-#define UNPROTECTED_SACL_SECINFO 0x10000000
-#define UNPROTECTED_DACL_SECINFO 0x20000000
-#define PROTECTED_SACL_SECINFO 0x40000000
-#define PROTECTED_DACL_SECINFO 0x80000000
-
-struct smb2_query_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 41 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 OutputBufferLength;
- __le16 InputBufferOffset;
- __u16 Reserved;
- __le32 InputBufferLength;
- __le32 AdditionalInformation;
- __le32 Flags;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_query_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_req {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 33 */
- __u8 InfoType;
- __u8 FileInfoClass;
- __le32 BufferLength;
- __le16 BufferOffset;
- __u16 Reserved;
- __le32 AdditionalInformation;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_set_info_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 2 */
-} __packed;
-
/* FILE Info response size */
#define FILE_DIRECTORY_INFORMATION_SIZE 1
#define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2
@@ -602,145 +297,11 @@ struct fs_type_info {
long magic_number;
} __packed;
-struct smb2_oplock_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __le64 PersistentFid;
- __le64 VolatileFid;
-} __packed;
-
-#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
-
-struct smb2_lease_break {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 44 */
- __le16 Epoch;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 CurrentLeaseState;
- __le32 NewLeaseState;
- __le32 BreakReason;
- __le32 AccessMaskHint;
- __le32 ShareMaskHint;
-} __packed;
-
-struct smb2_lease_ack {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 Reserved;
- __le32 Flags;
- __u8 LeaseKey[16];
- __le32 LeaseState;
- __le64 LeaseDuration;
-} __packed;
-
/*
- * PDU infolevel structure definitions
+ * PDU query infolevel structure definitions
* BB consider moving to a different header
*/
-/* File System Information Classes */
-#define FS_VOLUME_INFORMATION 1 /* Query */
-#define FS_LABEL_INFORMATION 2 /* Set */
-#define FS_SIZE_INFORMATION 3 /* Query */
-#define FS_DEVICE_INFORMATION 4 /* Query */
-#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
-#define FS_CONTROL_INFORMATION 6 /* Query, Set */
-#define FS_FULL_SIZE_INFORMATION 7 /* Query */
-#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
-#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
-#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
-
-struct smb2_fs_full_size_info {
- __le64 TotalAllocationUnits;
- __le64 CallerAvailableAllocationUnits;
- __le64 ActualAvailableAllocationUnits;
- __le32 SectorsPerAllocationUnit;
- __le32 BytesPerSector;
-} __packed;
-
-#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
-#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
-#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
-#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
-
-/* sector size info struct */
-struct smb3_fs_ss_info {
- __le32 LogicalBytesPerSector;
- __le32 PhysicalBytesPerSectorForAtomicity;
- __le32 PhysicalBytesPerSectorForPerf;
- __le32 FSEffPhysicalBytesPerSectorForAtomicity;
- __le32 Flags;
- __le32 ByteOffsetForSectorAlignment;
- __le32 ByteOffsetForPartitionAlignment;
-} __packed;
-
-/* File System Control Information */
-struct smb2_fs_control_info {
- __le64 FreeSpaceStartFiltering;
- __le64 FreeSpaceThreshold;
- __le64 FreeSpaceStopFiltering;
- __le64 DefaultQuotaThreshold;
- __le64 DefaultQuotaLimit;
- __le32 FileSystemControlFlags;
- __le32 Padding;
-} __packed;
-
-/* partial list of QUERY INFO levels */
-#define FILE_DIRECTORY_INFORMATION 1
-#define FILE_FULL_DIRECTORY_INFORMATION 2
-#define FILE_BOTH_DIRECTORY_INFORMATION 3
-#define FILE_BASIC_INFORMATION 4
-#define FILE_STANDARD_INFORMATION 5
-#define FILE_INTERNAL_INFORMATION 6
-#define FILE_EA_INFORMATION 7
-#define FILE_ACCESS_INFORMATION 8
-#define FILE_NAME_INFORMATION 9
-#define FILE_RENAME_INFORMATION 10
-#define FILE_LINK_INFORMATION 11
-#define FILE_NAMES_INFORMATION 12
-#define FILE_DISPOSITION_INFORMATION 13
-#define FILE_POSITION_INFORMATION 14
-#define FILE_FULL_EA_INFORMATION 15
-#define FILE_MODE_INFORMATION 16
-#define FILE_ALIGNMENT_INFORMATION 17
-#define FILE_ALL_INFORMATION 18
-#define FILE_ALLOCATION_INFORMATION 19
-#define FILE_END_OF_FILE_INFORMATION 20
-#define FILE_ALTERNATE_NAME_INFORMATION 21
-#define FILE_STREAM_INFORMATION 22
-#define FILE_PIPE_INFORMATION 23
-#define FILE_PIPE_LOCAL_INFORMATION 24
-#define FILE_PIPE_REMOTE_INFORMATION 25
-#define FILE_MAILSLOT_QUERY_INFORMATION 26
-#define FILE_MAILSLOT_SET_INFORMATION 27
-#define FILE_COMPRESSION_INFORMATION 28
-#define FILE_OBJECT_ID_INFORMATION 29
-/* Number 30 not defined in documents */
-#define FILE_MOVE_CLUSTER_INFORMATION 31
-#define FILE_QUOTA_INFORMATION 32
-#define FILE_REPARSE_POINT_INFORMATION 33
-#define FILE_NETWORK_OPEN_INFORMATION 34
-#define FILE_ATTRIBUTE_TAG_INFORMATION 35
-#define FILE_TRACKING_INFORMATION 36
-#define FILEID_BOTH_DIRECTORY_INFORMATION 37
-#define FILEID_FULL_DIRECTORY_INFORMATION 38
-#define FILE_VALID_DATA_LENGTH_INFORMATION 39
-#define FILE_SHORT_NAME_INFORMATION 40
-#define FILE_SFIO_RESERVE_INFORMATION 44
-#define FILE_SFIO_VOLUME_INFORMATION 45
-#define FILE_HARD_LINK_INFORMATION 46
-#define FILE_NORMALIZED_NAME_INFORMATION 48
-#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
-#define FILE_STANDARD_LINK_INFORMATION 54
-
-#define OP_BREAK_STRUCT_SIZE_20 24
-#define OP_BREAK_STRUCT_SIZE_21 36
-
struct smb2_file_access_info {
__le32 AccessFlags;
} __packed;
@@ -749,56 +310,6 @@ struct smb2_file_alignment_info {
__le32 AlignmentRequirement;
} __packed;
-struct smb2_file_internal_info {
- __le64 IndexNumber;
-} __packed; /* level 6 Query */
-
-struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[0]; /* New name to be assigned */
-} __packed; /* level 10 Set */
-
-struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
- char FileName[0]; /* Name to be assigned to new link */
-} __packed; /* level 11 Set */
-
-/*
- * This level 18, although with struct with same name is different from cifs
- * level 0x107. Level 0x107 has an extra u64 between AccessFlags and
- * CurrentByteOffset.
- */
-struct smb2_file_all_info { /* data block encoding of response to level 18 */
- __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
- __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile; /* size ie offset to first free byte in file */
- __le32 NumberOfLinks; /* hard links */
- __u8 DeletePending;
- __u8 Directory;
- __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
- __le64 IndexNumber;
- __le32 EASize;
- __le32 AccessFlags;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
- __le32 FileNameLength;
- char FileName[1];
-} __packed; /* level 18 Query */
-
struct smb2_file_basic_info { /* data block encoding of response to level 18 */
__le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
__le64 LastAccessTime;
@@ -810,7 +321,7 @@ struct smb2_file_basic_info { /* data block encoding of response to level 18 */
struct smb2_file_alt_name_info {
__le32 FileNameLength;
- char FileName[0];
+ char FileName[];
} __packed;
struct smb2_file_stream_info {
@@ -818,13 +329,9 @@ struct smb2_file_stream_info {
__le32 StreamNameLength;
__le64 StreamSize;
__le64 StreamAllocationSize;
- char StreamName[0];
+ char StreamName[];
} __packed;
-struct smb2_file_eof_info { /* encoding of request for level 10 */
- __le64 EndOfFile; /* new end of file value */
-} __packed; /* level 20 Set */
-
struct smb2_file_ntwrk_info {
__le64 CreationTime;
__le64 LastAccessTime;
@@ -915,34 +422,6 @@ struct create_sd_buf_req {
struct smb_ntsd ntsd;
} __packed;
-/* Find File infolevels */
-#define SMB_FIND_FILE_POSIX_INFO 0x064
-
-/* Level 100 query info */
-struct smb311_posix_qinfo {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 EndOfFile;
- __le64 AllocationSize;
- __le32 DosAttributes;
- __le64 Inode;
- __le32 DeviceId;
- __le32 Zero;
- /* beginning of POSIX Create Context Response */
- __le32 HardLinks;
- __le32 ReparseTag;
- __le32 Mode;
- u8 Sids[];
- /*
- * var sized owner SID
- * var sized group SID
- * le32 filenamelength
- * u8 filename[]
- */
-} __packed;
-
struct smb2_posix_info {
__le32 NextEntryOffset;
__u32 Ignored;
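Several of the smb2pdu.h hunks above replace zero-length or one-element trailing arrays such as char FileName[0] with C99 flexible array members, and drop the __le annotations on the opaque PersistentFileId/VolatileFileId pair. A minimal, hypothetical sketch of the flexible-array pattern these headers converge on follows; the structure and helper are illustrative, not part of the patch:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical structure, not from this patch. */
struct name_blob {
	u32	len;
	char	name[];		/* flexible array member, formerly "name[0]" or "name[1]" */
};

static struct name_blob *name_blob_alloc(const char *src, u32 len)
{
	/* struct_size() computes sizeof(*b) plus len trailing bytes, guarding against overflow. */
	struct name_blob *b = kmalloc(struct_size(b, name, len), GFP_KERNEL);

	if (!b)
		return NULL;
	b->len = len;
	memcpy(b->name, src, len);
	return b;
}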
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index ba5a22bc2e6d..e646d79554b8 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -211,7 +211,7 @@ struct smb_direct_rdma_rw_msg {
struct completion *completion;
struct rdma_rw_ctx rw_ctx;
struct sg_table sgt;
- struct scatterlist sg_list[0];
+ struct scatterlist sg_list[];
};
static inline int get_buf_page_count(void *buf, int size)
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 82a1429bbe12..8fef9de787d3 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -476,7 +476,7 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event,
switch (event) {
case NETDEV_UP:
- if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ if (netif_is_bridge_port(netdev))
return NOTIFY_OK;
list_for_each_entry(iface, &iface_list, entry) {
@@ -585,7 +585,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
rtnl_lock();
for_each_netdev(&init_net, netdev) {
- if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ if (netif_is_bridge_port(netdev))
continue;
if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL)))
return -ENOMEM;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 19d36393974c..dcdd07c6efff 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -11,7 +11,6 @@
#include <linux/writeback.h>
#include <linux/xattr.h>
#include <linux/falloc.h>
-#include <linux/genhd.h>
#include <linux/fsnotify.h>
#include <linux/dcache.h>
#include <linux/slab.h>
@@ -399,8 +398,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
nbytes = kernel_read(filp, rbuf, count, pos);
if (nbytes < 0) {
- pr_err("smb read failed for (%s), err = %zd\n",
- fp->filename, nbytes);
+ pr_err("smb read failed, err = %zd\n", nbytes);
return nbytes;
}
@@ -876,8 +874,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
err = vfs_truncate(&filp->f_path, size);
if (err)
- pr_err("truncate failed for filename : %s err %d\n",
- fp->filename, err);
+ pr_err("truncate failed, err %d\n", err);
return err;
}
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 29c1db66bd0f..c4d59d2735f0 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
kfree(smb_lock);
}
- kfree(fp->filename);
if (ksmbd_stream_fd(fp))
kfree(fp->stream.name);
kmem_cache_free(filp_cache, fp);
@@ -497,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
list_for_each_entry(lfp, &ci->m_fp_list, node) {
if (inode == file_inode(lfp->filp)) {
atomic_dec(&ci->m_count);
+ lfp = ksmbd_fp_get(lfp);
read_unlock(&ci->m_lock);
return lfp;
}
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 36239ce31afd..fcb13413fa8d 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -62,7 +62,6 @@ struct ksmbd_inode {
struct ksmbd_file {
struct file *filp;
- char *filename;
u64 persistent_id;
u64 volatile_id;
diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h
index 8857c01093d9..16499ca5c82d 100644
--- a/fs/ksmbd/xattr.h
+++ b/fs/ksmbd/xattr.h
@@ -76,7 +76,7 @@ struct xattr_acl_entry {
struct xattr_smb_acl {
int count;
int next;
- struct xattr_acl_entry entries[0];
+ struct xattr_acl_entry entries[];
};
/* 64bytes hash in xattr_ntacl is computed with sha256 */
diff --git a/fs/libfs.c b/fs/libfs.c
index 974125270a42..e64bdedef168 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -631,7 +631,7 @@ const struct address_space_operations ram_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);
@@ -1198,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void noop_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * There is no page cache to invalidate in the dax case, however
- * we need this callback defined to prevent falling back to
- * block_invalidatepage() in do_invalidatepage().
- */
-}
-EXPORT_SYMBOL_GPL(noop_invalidatepage);
-
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/*
@@ -1231,7 +1220,7 @@ EXPORT_SYMBOL(kfree_link);
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
struct inode *inode = new_inode_pseudo(s);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0475c5a5d061..59ef8a1f843f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -184,8 +184,7 @@ lockd(void *vrqstp)
dprintk("lockd_down: service stopped\n");
svc_exit_thread(rqstp);
-
- module_put_and_kthread_exit(0);
+ return 0;
}
static int create_lockd_listener(struct svc_serv *serv, const char *name,
@@ -197,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
xprt = svc_find_xprt(serv, name, net, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, net, family, port,
- SVC_SOCK_DEFAULTS, cred);
+ return svc_xprt_create(serv, name, net, family, port,
+ SVC_SOCK_DEFAULTS, cred);
svc_xprt_put(xprt);
return 0;
}
@@ -248,7 +247,8 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
return err;
}
@@ -286,9 +286,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_shutdown_net(serv, net);
- dprintk("%s: per-net data destroyed; net=%x\n",
- __func__, net->ns.inum);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
}
} else {
pr_err("%s: no users! net=%x\n",
@@ -350,13 +349,6 @@ static struct notifier_block lockd_inet6addr_notifier = {
};
#endif
-static const struct svc_serv_ops lockd_sv_ops = {
- .svo_shutdown = svc_rpcb_cleanup,
- .svo_function = lockd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
static int lockd_get(void)
{
struct svc_serv *serv;
@@ -380,7 +372,7 @@ static int lockd_get(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return -ENOMEM;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a71f1cf894b9..f1a6610e4ee6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
- ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -442,12 +442,14 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations minix_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = minix_readpage,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
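The minix hunks above combine two conversions seen throughout this diff: inode allocation goes through alloc_inode_sb() (kmem_cache_alloc_lru() against the superblock's inode LRU, so the object is charged to the correct memcg and reclaimable through the sb), and the address_space_operations move from set_page_dirty to the folio-based dirty_folio/invalidate_folio hooks. A sketch of the resulting shape for a hypothetical "myfs" (all myfs_* identifiers are illustrative placeholders):

static struct inode *myfs_alloc_inode(struct super_block *sb)
{
	struct myfs_inode_info *ei;

	/* Allocate from the fs inode cache via the superblock's LRU. */
	ei = alloc_inode_sb(sb, myfs_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static const struct address_space_operations myfs_aops = {
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
	.readpage		= myfs_readpage,
	.writepage		= myfs_writepage,
	.write_begin		= myfs_write_begin,
	.write_end		= generic_write_end,
	.bmap			= myfs_bmap,
	.direct_IO		= noop_direct_IO,
};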
diff --git a/fs/mpage.c b/fs/mpage.c
index 87f5cfef6caa..1fe56f8c495f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -57,38 +57,14 @@ static void mpage_end_io(struct bio *bio)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
+static struct bio *mpage_bio_submit(struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
-static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs,
- gfp_t gfp_flags)
-{
- struct bio *bio;
-
- /* Restrict the given (page cache) mask for slab allocations */
- gfp_flags &= GFP_KERNEL;
- bio = bio_alloc(gfp_flags, nr_vecs);
-
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
- }
-
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = first_sector;
- }
- return bio;
-}
-
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
@@ -169,17 +145,14 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
- int op_flags;
+ int op = REQ_OP_READ;
unsigned nblocks;
unsigned relative_block;
- gfp_t gfp;
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
if (args->is_readahead) {
- op_flags = REQ_RAHEAD;
- gfp = readahead_gfp_mask(page->mapping);
- } else {
- op_flags = 0;
- gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ op |= REQ_RAHEAD;
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
}
if (page_has_buffers(page))
@@ -287,7 +260,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
alloc_new:
if (args->bio == NULL) {
@@ -296,15 +269,16 @@ alloc_new:
page))
goto out;
}
- args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- bio_max_segs(args->nr_pages), gfp);
+ args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op,
+ gfp);
if (args->bio == NULL)
goto confused;
+ args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
}
length = first_hole << blkbits;
if (bio_add_page(args->bio, page, length, 0) < length) {
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
goto alloc_new;
}
@@ -312,7 +286,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -320,7 +294,7 @@ out:
confused:
if (args->bio)
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
if (!PageUptodate(page))
block_read_full_page(page, args->get_block);
else
@@ -383,7 +357,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
put_page(page);
}
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
+ mpage_bio_submit(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
@@ -400,7 +374,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, 0, args.bio);
+ mpage_bio_submit(args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
@@ -491,7 +465,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -504,7 +477,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (!buffer_mapped(bh)) {
/*
* unmapped dirty buffers are created by
- * __set_page_dirty_buffers -> mmapped data
+ * block_dirty_folio -> mmapped data
*/
if (buffer_dirty(bh))
goto confused;
@@ -599,7 +572,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
alloc_new:
if (bio == NULL) {
@@ -608,13 +581,11 @@ alloc_new:
page, wbc))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
- if (bio == NULL)
- goto confused;
-
+ bio = bio_alloc(bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS);
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
wbc_init_bio(wbc, bio);
- bio->bi_write_hint = inode->i_write_hint;
}
/*
@@ -625,7 +596,7 @@ alloc_new:
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
goto alloc_new;
}
@@ -635,7 +606,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -647,7 +618,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -703,11 +674,8 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
}
blk_finish_plug(&plug);
return ret;
@@ -724,11 +692,8 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
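The mpage conversion above moves to the bio_alloc() calling convention that takes the target block device, the vector count and the operation/flags at allocation time; because bio_alloc() cannot fail for nr_vecs <= BIO_MAX_VECS when the gfp mask allows sleeping, the old mpage_alloc() fallback loop is no longer needed. A short, hypothetical sketch of the same pattern (only the block-layer calls are real; the helper and its caller contract are illustrative):

static struct bio *start_read_bio(struct block_device *bdev, struct page *page,
				  sector_t first_sector, bool readahead)
{
	unsigned int op = REQ_OP_READ | (readahead ? REQ_RAHEAD : 0);
	struct bio *bio;

	/* bdev and op/flags are set at allocation time; only the sector remains. */
	bio = bio_alloc(bdev, 1, op, GFP_KERNEL);
	bio->bi_iter.bi_sector = first_sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	return bio;	/* caller sets bi_end_io and calls submit_bio() */
}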
diff --git a/fs/namei.c b/fs/namei.c
index 3f1829b3ab5b..509657fdf4f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3673,18 +3673,14 @@ static struct dentry *filename_create(int dfd, struct filename *name,
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct qstr last;
+ bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
+ unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
+ unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
int err2;
int error;
- bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
- /*
- * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
- * other flags passed in are ignored!
- */
- lookup_flags &= LOOKUP_REVAL;
-
- error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
if (error)
return ERR_PTR(error);
@@ -3698,11 +3694,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
/* don't fail immediately if it's r/o, at least try to report other errors */
err2 = mnt_want_write(path->mnt);
/*
- * Do the final lookup.
+ * Do the final lookup. Suppress 'create' if there is a trailing
+ * '/', and a directory wasn't requested.
*/
- lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ if (last.name[last.len] && !want_dir)
+ create_flags = 0;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path->dentry, lookup_flags);
+ dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3716,7 +3714,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (unlikely(!is_dir && last.name[last.len])) {
+ if (unlikely(!create_flags)) {
error = -ENOENT;
goto fail;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index de6fae84f1a1..afe2b64b14f1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
- cpu_relax();
+ might_lock(&mount_lock.lock);
+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ cpu_relax();
+ } else {
+ /*
+ * This prevents priority inversion, if the task
+ * setting MNT_WRITE_HOLD got preempted on a remote
+			 * CPU, and it prevents livelock if the task setting
+ * MNT_WRITE_HOLD has a lower priority and is bound to
+ * the same CPU as the task that is spinning here.
+ */
+ preempt_enable();
+ lock_mount_hash();
+ unlock_mount_hash();
+ preempt_disable();
+ }
+ }
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
@@ -563,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb)
lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
- smp_mb();
- if (mnt_get_writers(mnt) > 0) {
- err = -EBUSY;
+ err = mnt_hold_writers(mnt);
+ if (err)
break;
- }
}
}
if (!err && atomic_long_read(&sb->s_remove_count))
@@ -2099,22 +2112,23 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
unsigned int max = READ_ONCE(sysctl_mount_max);
- unsigned int mounts = 0, old, pending, sum;
+ unsigned int mounts = 0;
struct mount *p;
+ if (ns->mounts >= max)
+ return -ENOSPC;
+ max -= ns->mounts;
+ if (ns->pending_mounts >= max)
+ return -ENOSPC;
+ max -= ns->pending_mounts;
+
for (p = mnt; p; p = next_mnt(p, mnt))
mounts++;
- old = ns->mounts;
- pending = ns->pending_mounts;
- sum = old + pending;
- if ((old > sum) ||
- (pending > sum) ||
- (max < sum) ||
- (mounts > (max - sum)))
+ if (mounts > max)
return -ENOSPC;
- ns->pending_mounts = pending + mounts;
+ ns->pending_mounts += mounts;
return 0;
}
@@ -2597,6 +2611,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
struct super_block *sb = mnt->mnt_sb;
if (!__mnt_is_readonly(mnt) &&
+ (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
@@ -2611,6 +2626,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
+ sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
}
}
@@ -2906,7 +2922,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
* add a mount into a namespace's mount tree
*/
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
- struct path *path, int mnt_flags)
+ const struct path *path, int mnt_flags)
{
struct mount *parent = real_mount(path->mnt);
@@ -3029,7 +3045,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
return err;
}
-int finish_automount(struct vfsmount *m, struct path *path)
+int finish_automount(struct vfsmount *m, const struct path *path)
{
struct dentry *dentry = path->dentry;
struct mountpoint *mp;
@@ -3998,46 +4014,69 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return 0;
}
-static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
- struct mount *mnt, int *err)
+/**
+ * mnt_allow_writers() - check whether the attribute change allows writers
+ * @kattr: the new mount attributes
+ * @mnt: the mount to which @kattr will be applied
+ *
+ * Check whether the new mount attributes in @kattr allow concurrent writers.
+ *
+ * Return: true if writers are still allowed, false if they need to be held
+ */
+static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
+ const struct mount *mnt)
{
- struct mount *m = mnt, *last = NULL;
+ return !(kattr->attr_set & MNT_READONLY) ||
+ (mnt->mnt.mnt_flags & MNT_READONLY);
+}
- if (!is_mounted(&m->mnt)) {
- *err = -EINVAL;
- goto out;
- }
+static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
+{
+ struct mount *m;
+ int err;
- if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
- *err = -EINVAL;
- goto out;
- }
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
+ err = -EPERM;
+ break;
+ }
- do {
- unsigned int flags;
+ err = can_idmap_mount(kattr, m);
+ if (err)
+ break;
- flags = recalc_flags(kattr, m);
- if (!can_change_locked_flags(m, flags)) {
- *err = -EPERM;
- goto out;
+ if (!mnt_allow_writers(kattr, m)) {
+ err = mnt_hold_writers(m);
+ if (err)
+ break;
}
- *err = can_idmap_mount(kattr, m);
- if (*err)
- goto out;
+ if (!kattr->recurse)
+ return 0;
+ }
- last = m;
+ if (err) {
+ struct mount *p;
- if ((kattr->attr_set & MNT_READONLY) &&
- !(m->mnt.mnt_flags & MNT_READONLY)) {
- *err = mnt_hold_writers(m);
- if (*err)
- goto out;
+ /*
+ * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
+ * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
+ * mounts and needs to take care to include the first mount.
+ */
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ /* If we had to hold writers unblock them. */
+ if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
+ mnt_unhold_writers(p);
+
+ /*
+ * We're done once the first mount we changed got
+ * MNT_WRITE_HOLD unset.
+ */
+ if (p == m)
+ break;
}
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
-out:
- return last;
+ }
+ return err;
}
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
@@ -4065,48 +4104,32 @@ static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
put_user_ns(old_mnt_userns);
}
-static void mount_setattr_commit(struct mount_kattr *kattr,
- struct mount *mnt, struct mount *last,
- int err)
+static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
- struct mount *m = mnt;
+ struct mount *m;
- do {
- if (!err) {
- unsigned int flags;
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ unsigned int flags;
- do_idmap_mount(kattr, m);
- flags = recalc_flags(kattr, m);
- WRITE_ONCE(m->mnt.mnt_flags, flags);
- }
+ do_idmap_mount(kattr, m);
+ flags = recalc_flags(kattr, m);
+ WRITE_ONCE(m->mnt.mnt_flags, flags);
- /*
- * We either set MNT_READONLY above so make it visible
- * before ~MNT_WRITE_HOLD or we failed to recursively
- * apply mount options.
- */
- if ((kattr->attr_set & MNT_READONLY) &&
- (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+ /* If we had to hold writers unblock them. */
+ if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt_unhold_writers(m);
- if (!err && kattr->propagation)
+ if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
-
- /*
- * On failure, only cleanup until we found the first mount
- * we failed to handle.
- */
- if (err && m == last)
+ if (!kattr->recurse)
break;
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
- if (!err)
- touch_mnt_namespace(mnt->mnt_ns);
+ }
+ touch_mnt_namespace(mnt->mnt_ns);
}
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
- struct mount *mnt = real_mount(path->mnt), *last = NULL;
+ struct mount *mnt = real_mount(path->mnt);
int err = 0;
if (path->dentry != mnt->mnt.mnt_root)
@@ -4127,16 +4150,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
}
}
+ err = -EINVAL;
lock_mount_hash();
+ /* Ensure that this isn't anything purely vfs internal. */
+ if (!is_mounted(&mnt->mnt))
+ goto out;
+
+ /*
+ * If this is an attached mount make sure it's located in the callers
+ * mount namespace. If it's not don't let the caller interact with it.
+ * If this is a detached mount make sure it has an anonymous mount
+ * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+ */
+ if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ goto out;
+
/*
- * Get the mount tree in a shape where we can change mount
- * properties without failure.
+ * First, we get the mount tree in a shape where we can change mount
+ * properties without failure. If we succeeded to do so we commit all
+ * changes and if we failed we clean up.
*/
- last = mount_setattr_prepare(kattr, mnt, &err);
- if (last) /* Commit all changes or revert to the old state. */
- mount_setattr_commit(kattr, mnt, last, err);
+ err = mount_setattr_prepare(kattr, mnt);
+ if (!err)
+ mount_setattr_commit(kattr, mnt);
+out:
unlock_mount_hash();
if (kattr->propagation) {
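The mount_setattr_prepare()/mount_setattr_commit() rework above backs the mount_setattr(2) syscall; a recursive read-only change is the case that exercises the mnt_hold_writers()/mnt_unhold_writers() handling. A hedged userspace sketch (assumes recent kernel headers; there is no glibc wrapper, so the raw syscall is used, and error handling is elided):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000	/* from linux/fcntl.h on older userspace headers */
#endif

static int make_subtree_ro(const char *path)
{
	struct mount_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.attr_set = MOUNT_ATTR_RDONLY;

	/* Recursively apply MOUNT_ATTR_RDONLY to the whole subtree at @path. */
	return syscall(SYS_mount_setattr, AT_FDCWD, path, AT_RECURSIVE,
		       &attr, sizeof(attr));
}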
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index c15bfc966d96..f684c0cd1ec5 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
-netfs-y := read_helper.o stats.o
+netfs-y := \
+ buffered_read.o \
+ io.o \
+ main.o \
+ objects.o
+
+netfs-$(CONFIG_NETFS_STATS) += stats.o
obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
new file mode 100644
index 000000000000..281a88a5b8dc
--- /dev/null
+++ b/fs/netfs/buffered_read.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level buffered read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * folios we're going to write back before we unlock them.
+ */
+void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos, account = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
+ __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ }
+ }
+
+ /* Walk through the pagecache and the I/O request lists simultaneously.
+ * We may have a mixture of cached and uncached sections and we only
+ * really want to write out the uncached sections. This is slightly
+ * complicated by the possibility that we might have huge pages with a
+ * mixture inside.
+ */
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ iopos = 0;
+ subreq_failed = (subreq->error < 0);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ folio_start_fscache(folio);
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ account += subreq->transferred;
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed) {
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio_index(folio) == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
+ _debug("no unlock");
+ else
+ folio_unlock(folio);
+ }
+ }
+ rcu_read_unlock();
+
+ task_io_account_read(account);
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+}
+
+static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
+ loff_t *_start, size_t *_len, loff_t i_size)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (cres->ops && cres->ops->expand_readahead)
+ cres->ops->expand_readahead(cres, _start, _len, i_size);
+}
+
+static void netfs_rreq_expand(struct netfs_io_request *rreq,
+ struct readahead_control *ractl)
+{
+ /* Give the cache a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
+
+ /* Give the netfs a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ if (rreq->netfs_ops->expand_readahead)
+ rreq->netfs_ops->expand_readahead(rreq);
+
+ /* Expand the request if the cache wants it to start earlier. Note
+ * that the expansion may get further extended if the VM wishes to
+ * insert THPs and the preferred start and/or end wind up in the middle
+ * of THPs.
+ *
+ * If this is the case, however, the THP size should be an integer
+ * multiple of the cache granule size, so we get a whole number of
+ * granules to deal with.
+ */
+ if (rreq->start != readahead_pos(ractl) ||
+ rreq->len != readahead_length(ractl)) {
+ readahead_expand(ractl, rreq->start, rreq->len);
+ rreq->start = readahead_pos(ractl);
+ rreq->len = readahead_length(ractl);
+
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_expanded);
+ }
+}
+
+/**
+ * netfs_readahead - Helper to manage a read request
+ * @ractl: The description of the readahead request
+ *
+ * Fulfil a readahead request by drawing data from the cache if possible, or
+ * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
+ * requests from different sources will get munged together. If necessary, the
+ * readahead window can be expanded in either direction to a more convenient
+ * alignment for RPC efficiency or to make storage in the cache feasible.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+void netfs_readahead(struct readahead_control *ractl)
+{
+ struct netfs_io_request *rreq;
+ struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+ int ret;
+
+ _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+
+ if (readahead_count(ractl) == 0)
+ return;
+
+ rreq = netfs_alloc_request(ractl->mapping, ractl->file,
+ readahead_pos(ractl),
+ readahead_length(ractl),
+ NETFS_READAHEAD);
+ if (IS_ERR(rreq))
+ return;
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
+ }
+
+ netfs_stat(&netfs_n_rh_readahead);
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_readahead);
+
+ netfs_rreq_expand(rreq, ractl);
+
+ /* Drop the refs on the folios here rather than in the cache or
+ * filesystem. The locks will be dropped in netfs_rreq_unlock().
+ */
+ while (readahead_folio(ractl))
+ ;
+
+ netfs_begin_read(rreq, false);
+ return;
+
+cleanup_free:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ return;
+}
+EXPORT_SYMBOL(netfs_readahead);
+
+/**
+ * netfs_readpage - Helper to manage a readpage request
+ * @file: The file to read from
+ * @subpage: A subpage of the folio to read
+ *
+ * Fulfil a readpage request by drawing data from the cache if possible, or the
+ * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
+ * from different sources will get munged together.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_readpage(struct file *file, struct page *subpage)
+{
+ struct folio *folio = page_folio(subpage);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct netfs_io_request *rreq;
+ struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+ int ret;
+
+ _enter("%lx", folio_index(folio));
+
+ rreq = netfs_alloc_request(mapping, file,
+ folio_file_pos(folio), folio_size(folio),
+ NETFS_READPAGE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto alloc_error;
+ }
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
+ }
+
+ netfs_stat(&netfs_n_rh_readpage);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+ return netfs_begin_read(rreq, true);
+
+discard:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+alloc_error:
+ folio_unlock(folio);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_readpage);
+
+/*
+ * Prepare a folio for writing without reading first
+ * @folio: The folio being prepared
+ * @pos: starting position for the write
+ * @len: length of write
+ * @always_fill: T if the folio should always be completely filled/cleared
+ *
+ * In some cases, write_begin doesn't need to read at all:
+ * - full folio write
+ * - write that lies in a folio that is completely beyond EOF
+ * - write that covers the folio from start to EOF or beyond it
+ *
+ * If any of these criteria are met, then zero out the unwritten parts
+ * of the folio and return true. Otherwise, return false.
+ */
+static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
+ bool always_fill)
+{
+ struct inode *inode = folio_inode(folio);
+ loff_t i_size = i_size_read(inode);
+ size_t offset = offset_in_folio(folio, pos);
+ size_t plen = folio_size(folio);
+
+ if (unlikely(always_fill)) {
+ if (pos - offset + len <= i_size)
+ return false; /* Page entirely before EOF */
+ zero_user_segment(&folio->page, 0, plen);
+ folio_mark_uptodate(folio);
+ return true;
+ }
+
+ /* Full folio write */
+ if (offset == 0 && len >= plen)
+ return true;
+
+ /* Page entirely beyond the end of the file */
+ if (pos - offset >= i_size)
+ goto zero_out;
+
+ /* Write that covers from the start of the folio to EOF or beyond */
+ if (offset == 0 && (pos + len) >= i_size)
+ goto zero_out;
+
+ return false;
+zero_out:
+ zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ return true;
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write (may extend beyond the end of the folio chosen)
+ * @aop_flags: AOP_* flags
+ * @_folio: Where to put the resultant folio
+ * @_fsdata: Place for the netfs to store a cookie
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not. Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together. If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the folio is grabbed and locked. It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end. It is permitted to sleep. It should return 0 if the request
+ * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
+ * be looked up again; or return an error.
+ *
+ * The calling netfs must initialise a netfs context contiguous to the vfs
+ * inode before calling this.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int aop_flags,
+ struct folio **_folio, void **_fsdata)
+{
+ struct netfs_io_request *rreq;
+	struct netfs_i_context *ctx = netfs_i_context(file_inode(file));
+ struct folio *folio;
+ unsigned int fgp_flags;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ int ret;
+
+ DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
+
+retry:
+ fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+ if (aop_flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+ if (!folio)
+ return -ENOMEM;
+
+ if (ctx->ops->check_write_begin) {
+ /* Allow the netfs (eg. ceph) to flush conflicts. */
+ ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+ if (ret < 0) {
+ trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
+ if (ret == -EAGAIN)
+ goto retry;
+ goto error;
+ }
+ }
+
+ if (folio_test_uptodate(folio))
+ goto have_folio;
+
+ /* If the page is beyond the EOF, we want to clear it - unless it's
+ * within the cache granule containing the EOF, in which case we need
+ * to preload the granule.
+ */
+ if (!netfs_is_cache_enabled(ctx) &&
+ netfs_skip_folio_read(folio, pos, len, false)) {
+ netfs_stat(&netfs_n_rh_write_zskip);
+ goto have_folio_no_wait;
+ }
+
+ rreq = netfs_alloc_request(mapping, file,
+ folio_file_pos(folio), folio_size(folio),
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+ rreq->no_unlock_folio = folio_index(folio);
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+
+ if (ctx->ops->begin_cache_operation) {
+ ret = ctx->ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+ }
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
+
+ /* Expand the request to meet caching requirements and download
+ * preferences.
+ */
+ ractl._nr_pages = folio_nr_pages(folio);
+ netfs_rreq_expand(rreq, &ractl);
+
+ /* We hold the folio locks, so we can drop the references */
+ folio_get(folio);
+ while (readahead_folio(&ractl))
+ ;
+
+ ret = netfs_begin_read(rreq, true);
+ if (ret < 0)
+ goto error;
+
+have_folio:
+ ret = folio_wait_fscache_killable(folio);
+ if (ret < 0)
+ goto error;
+have_folio_no_wait:
+ *_folio = folio;
+ _leave(" = 0");
+ return 0;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+error:
+ folio_unlock(folio);
+ folio_put(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_write_begin);
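netfs_readahead() and netfs_readpage() are designed to be plugged directly into a network filesystem's address_space_operations, with the per-inode netfs context supplying the netfs_request_ops. A sketch of that wiring for a hypothetical "myfs" (all myfs_* identifiers are illustrative; the inode's netfs_i_context is assumed to have been initialised against myfs_req_ops, e.g. via netfs_i_context_init(), when the inode was set up):

/* Illustrative operations table; only issue_read is mandatory. */
static const struct netfs_request_ops myfs_req_ops = {
	.issue_read		= myfs_issue_read,		/* send the RPC for one subrequest */
	.begin_cache_operation	= myfs_begin_cache_operation,	/* optional fscache hookup */
};

const struct address_space_operations myfs_aops = {
	.readahead	= netfs_readahead,
	.readpage	= netfs_readpage,
	.write_begin	= myfs_write_begin,	/* typically a thin wrapper around netfs_write_begin() */
	.write_end	= myfs_write_end,
	.writepages	= myfs_writepages,
};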
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b7f2c4459f33..b7b0e3d18d9e 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,6 +5,10 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/netfs.h>
+#include <linux/fscache.h>
+#include <trace/events/netfs.h>
+
#ifdef pr_fmt
#undef pr_fmt
#endif
@@ -12,11 +16,40 @@
#define pr_fmt(fmt) "netfs: " fmt
/*
- * read_helper.c
+ * buffered_read.c
+ */
+void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+
+/*
+ * io.c
+ */
+int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
+
+/*
+ * main.c
*/
extern unsigned int netfs_debug;
/*
+ * objects.c
+ */
+struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
+ struct file *file,
+ loff_t start, size_t len,
+ enum netfs_io_origin origin);
+void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
+void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async);
+void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
+ enum netfs_rreq_ref_trace what);
+struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
+
+static inline void netfs_see_request(struct netfs_io_request *rreq,
+ enum netfs_rreq_ref_trace what)
+{
+ trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
+}
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
@@ -55,6 +88,21 @@ static inline void netfs_stat_d(atomic_t *stat)
#define netfs_stat_d(x) do {} while(0)
#endif
+/*
+ * Miscellaneous functions.
+ */
+static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx)
+{
+#if IS_ENABLED(CONFIG_FSCACHE)
+ struct fscache_cookie *cookie = ctx->cache;
+
+ return fscache_cookie_valid(cookie) && cookie->cache_priv &&
+ fscache_cookie_enabled(cookie);
+#else
+ return false;
+#endif
+}
+
/*****************************************************************************/
/*
* debug tracing
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
new file mode 100644
index 000000000000..428925899282
--- /dev/null
+++ b/fs/netfs/io.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
+{
+ struct iov_iter iter;
+
+ iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+ iov_iter_zero(iov_iter_count(&iter), &iter);
+}
+
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+
+ netfs_subreq_terminated(subreq, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_from_cache(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ enum netfs_read_from_hole read_hole)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+
+ netfs_stat(&netfs_n_rh_read);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+
+ cres->ops->read(cres, subreq->start, &iter, read_hole,
+ netfs_cache_read_terminated, subreq);
+}
+
+/*
+ * Fill a subrequest region with zeroes.
+ */
+static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_zero);
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, 0, false);
+}
+
+/*
+ * Ask the netfs to issue a read request to the server for us.
+ *
+ * The netfs is expected to read from subreq->pos + subreq->transferred to
+ * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
+ * buffer prior to the transferred point as it might clobber dirty data
+ * obtained from the cache.
+ *
+ * Alternatively, the netfs is allowed to indicate one of two things:
+ *
+ * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
+ * make progress.
+ *
+ * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
+ * cleared.
+ */
+static void netfs_read_from_server(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_download);
+ rreq->netfs_ops->issue_read(subreq);
+}
+
+/*
+ * Release those waiting.
+ */
+static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, was_async);
+ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache. We have to clear
+ * the PG_fscache bits on the folios involved and release the caller's ref.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ pgoff_t unlocked = 0;
+ bool have_unlocked = false;
+
+ rcu_read_lock();
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+ xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+ /* We might have multiple writes from the same huge
+ * folio, but we mustn't unlock a folio more than once.
+ */
+ if (have_unlocked && folio_index(folio) <= unlocked)
+ continue;
+ unlocked = folio_index(folio);
+ folio_end_fscache(folio);
+ have_unlocked = true;
+ }
+ }
+
+ rcu_read_unlock();
+ netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ netfs_stat(&netfs_n_rh_write_failed);
+ trace_netfs_failure(rreq, subreq, transferred_or_error,
+ netfs_fail_copy_to_cache);
+ } else {
+ netfs_stat(&netfs_n_rh_write_done);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
+
+ /* If we decrement nr_copy_ops to 0, the ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_copy_ops))
+ netfs_rreq_unmark_after_write(rreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+
+/*
+ * Perform any outstanding writes to the cache. We inherit a ref from the
+ * caller.
+ */
+static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct netfs_io_subrequest *subreq, *next, *p;
+ struct iov_iter iter;
+ int ret;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_copy);
+
+ /* We don't want terminating writes trying to wake us up whilst we're
+ * still going through the list.
+ */
+ atomic_inc(&rreq->nr_copy_ops);
+
+ list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
+ if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ list_del_init(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false,
+ netfs_sreq_trace_put_no_copy);
+ }
+ }
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ /* Amalgamate adjacent writes */
+ while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ next = list_next_entry(subreq, rreq_link);
+ if (next->start != subreq->start + subreq->len)
+ break;
+ subreq->len += next->len;
+ list_del_init(&next->rreq_link);
+ netfs_put_subrequest(next, false,
+ netfs_sreq_trace_put_merged);
+ }
+
+ ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
+ rreq->i_size, true);
+ if (ret < 0) {
+ trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
+ continue;
+ }
+
+ iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ subreq->start, subreq->len);
+
+ atomic_inc(&rreq->nr_copy_ops);
+ netfs_stat(&netfs_n_rh_write);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write);
+ cres->ops->write(cres, subreq->start, &iter,
+ netfs_rreq_copy_terminated, subreq);
+ }
+
+ /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_copy_ops))
+ netfs_rreq_unmark_after_write(rreq, false);
+}
+
+static void netfs_rreq_write_to_cache_work(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, work);
+
+ netfs_rreq_do_write_to_cache(rreq);
+}
+
+static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
+{
+ rreq->work.func = netfs_rreq_write_to_cache_work;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+}
+
+/*
+ * Handle a short read.
+ */
+static void netfs_rreq_short_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
+
+ netfs_stat(&netfs_n_rh_short_read);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
+ atomic_inc(&rreq->nr_outstanding);
+ if (subreq->source == NETFS_READ_FROM_CACHE)
+ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
+ else
+ netfs_read_from_server(rreq, subreq);
+}
+
+/*
+ * Resubmit any short or failed operations. Returns true if we got the rreq
+ * ref back.
+ */
+static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ WARN_ON(in_interrupt());
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
+ /* We don't want terminating submissions trying to wake us up whilst
+ * we're still going through the list.
+ */
+ atomic_inc(&rreq->nr_outstanding);
+
+ __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error) {
+ if (subreq->source != NETFS_READ_FROM_CACHE)
+ break;
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->error = 0;
+ netfs_stat(&netfs_n_rh_download_instead);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ atomic_inc(&rreq->nr_outstanding);
+ netfs_read_from_server(rreq, subreq);
+ } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
+ netfs_rreq_short_read(rreq, subreq);
+ }
+ }
+
+ /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ return true;
+
+ wake_up_var(&rreq->nr_outstanding);
+ return false;
+}
+
+/*
+ * Check to see if the data read is still valid.
+ */
+static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ if (!rreq->netfs_ops->is_still_valid ||
+ rreq->netfs_ops->is_still_valid(rreq))
+ return;
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ subreq->error = -ESTALE;
+ __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ }
+ }
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * Note that we could be in an ordinary kernel thread, on a workqueue or in
+ * softirq context at this point. We inherit a ref from the caller.
+ */
+static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
+again:
+ netfs_rreq_is_still_valid(rreq);
+
+ if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
+ if (netfs_rreq_perform_resubmissions(rreq))
+ goto again;
+ return;
+ }
+
+ netfs_rreq_unlock_folios(rreq);
+
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
+ return netfs_rreq_write_to_cache(rreq);
+
+ netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_work(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, work);
+ netfs_rreq_assess(rreq, false);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ */
+static void netfs_rreq_terminated(struct netfs_io_request *rreq,
+ bool was_async)
+{
+ if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
+ was_async) {
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_rreq_assess(rreq, was_async);
+ }
+}
+
+/**
+ * netfs_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred, 0 to
+ * indicate a failure to transfer anything that should be retried or a negative
+ * error code. The helper will look after reissuing I/O operations as
+ * appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
+ ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ int u;
+
+ _enter("[%u]{%llx,%lx},%zd",
+ subreq->debug_index, subreq->start, subreq->flags,
+ transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(rreq, subreq, transferred_or_error,
+ netfs_fail_read);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq overread: R%x[%x] %zd > %zu - %zu",
+ rreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+complete:
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
+
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&rreq->nr_outstanding);
+ if (u == 0)
+ netfs_rreq_terminated(rreq, was_async);
+ else if (u == 1)
+ wake_up_var(&rreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
+ netfs_clear_unread(subreq);
+ subreq->transferred = subreq->len;
+ goto complete;
+ }
+
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ goto out;
+
+failed:
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ rreq->error = subreq->error;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_subreq_terminated);
+
+static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (cres->ops)
+ return cres->ops->prepare_read(subreq, i_size);
+ if (subreq->start >= rreq->i_size)
+ return NETFS_FILL_WITH_ZEROES;
+ return NETFS_DOWNLOAD_FROM_SERVER;
+}
+
+/*
+ * Work out what sort of subrequest the next one will be.
+ */
+static enum netfs_io_source
+netfs_rreq_prepare_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ enum netfs_io_source source;
+
+ _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+
+ if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+ /* Call out to the netfs to let it shrink the request to fit
+ * its own I/O sizes and boundaries. If it shrinks it here, it
+ * will be called again to make simultaneous calls; if it wants
+ * to make serial calls, it can indicate a short read and then
+ * we will call it again.
+ */
+ if (subreq->len > rreq->i_size - subreq->start)
+ subreq->len = rreq->i_size - subreq->start;
+
+ if (rreq->netfs_ops->clamp_length &&
+ !rreq->netfs_ops->clamp_length(subreq)) {
+ source = NETFS_INVALID_READ;
+ goto out;
+ }
+ }
+
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+
+out:
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ return source;
+}
+
+/*
+ * Slice off a piece of a read request and submit an I/O request for it.
+ */
+static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ unsigned int *_debug_index)
+{
+ struct netfs_io_subrequest *subreq;
+ enum netfs_io_source source;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq)
+ return false;
+
+ subreq->debug_index = (*_debug_index)++;
+ subreq->start = rreq->start + rreq->submitted;
+ subreq->len = rreq->len - rreq->submitted;
+
+ _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ /* Call out to the cache to find out what it can do with the remaining
+ * subset. It tells us in subreq->flags what it decided should be done
+ * and adjusts subreq->len down if the subset crosses a cache boundary.
+ *
+ * Then when we hand off the subset, the cache can choose to take a subset of that
+ * (the starts must coincide), in which case, we go around the loop
+ * again and ask it to download the next piece.
+ */
+ source = netfs_rreq_prepare_read(rreq, subreq);
+ if (source == NETFS_INVALID_READ)
+ goto subreq_failed;
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ rreq->submitted += subreq->len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ switch (source) {
+ case NETFS_FILL_WITH_ZEROES:
+ netfs_fill_with_zeroes(rreq, subreq);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_read_from_server(rreq, subreq);
+ break;
+ case NETFS_READ_FROM_CACHE:
+ netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+subreq_failed:
+ rreq->error = subreq->error;
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
+ return false;
+}
+
+/*
+ * Begin the process of reading in a chunk of data, where that data may be
+ * stitched together from multiple sources, including multiple servers and the
+ * local cache.
+ */
+int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
+{
+ unsigned int debug_index = 0;
+ int ret;
+
+ _enter("R=%x %llx-%llx",
+ rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
+
+ if (rreq->len == 0) {
+ pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
+ return -EIO;
+ }
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
+
+ if (sync)
+ netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+
+ /* Chop the read into slices according to what the cache and the netfs
+ * want and submit each one.
+ */
+ atomic_set(&rreq->nr_outstanding, 1);
+ do {
+ if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ break;
+
+ } while (rreq->submitted < rreq->len);
+
+ if (sync) {
+ /* Keep nr_outstanding incremented so that the ref always belongs to
+ * us, and the service code isn't punted off to a random thread pool to
+ * process.
+ */
+ for (;;) {
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ netfs_rreq_assess(rreq, false);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+ break;
+ cond_resched();
+ }
+
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
+ } else {
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_assess(rreq, false);
+ ret = 0;
+ }
+ return ret;
+}
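A hedged sketch of the consumer side of the contract documented on netfs_subreq_terminated() above: what a network filesystem's ->issue_read() hook might look like. The netfs_subreq_terminated() call and the iov_iter_xarray() usage mirror the code in this file; myfs_server_read() and its return convention (bytes read or -errno) are assumptions made up for illustration.

static void myfs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	loff_t pos = subreq->start + subreq->transferred;
	size_t len = subreq->len - subreq->transferred;
	struct iov_iter iter;
	ssize_t ret;

	/* Read into the pagecache pages backing this slice of the request. */
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, pos, len);

	/* Hypothetical RPC to the server. */
	ret = myfs_server_read(rreq->inode, pos, &iter);

	/* Positive: bytes transferred; 0: retryable failure; negative: error.
	 * The helper handles resubmission and cache writeback from here.
	 */
	netfs_subreq_terminated(subreq, ret, false);
}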
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
new file mode 100644
index 000000000000..068568702957
--- /dev/null
+++ b/fs/netfs/main.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Miscellaneous bits for the netfs support library.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
+
+MODULE_DESCRIPTION("Network fs support");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned netfs_debug;
+module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
new file mode 100644
index 000000000000..e86107b30ba4
--- /dev/null
+++ b/fs/netfs/objects.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Object lifetime handling and tracing.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Allocate an I/O request and initialise it.
+ */
+struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
+ struct file *file,
+ loff_t start, size_t len,
+ enum netfs_io_origin origin)
+{
+ static atomic_t debug_ids;
+ struct inode *inode = file ? file_inode(file) : mapping->host;
+ struct netfs_i_context *ctx = netfs_i_context(inode);
+ struct netfs_io_request *rreq;
+ int ret;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->origin = origin;
+ rreq->netfs_ops = ctx->ops;
+ rreq->mapping = mapping;
+ rreq->inode = inode;
+ rreq->i_size = i_size_read(inode);
+ rreq->debug_id = atomic_inc_return(&debug_ids);
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (rreq->netfs_ops->init_request) {
+ ret = rreq->netfs_ops->init_request(rreq, file);
+ if (ret < 0) {
+ kfree(rreq);
+ return ERR_PTR(ret);
+ }
+ }
+
+ netfs_stat(&netfs_n_rh_rreq);
+ return rreq;
+}
+
+void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what)
+{
+ int r;
+
+ __refcount_inc(&rreq->ref, &r);
+ trace_netfs_rreq_ref(rreq->debug_id, r + 1, what);
+}
+
+void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async,
+ netfs_sreq_trace_put_clear);
+ }
+}
+
+static void netfs_free_request(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, work);
+
+ netfs_clear_subrequests(rreq, false);
+ if (rreq->netfs_priv)
+ rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+ netfs_stat_d(&netfs_n_rh_rreq);
+}
+
+void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
+ enum netfs_rreq_ref_trace what)
+{
+ unsigned int debug_id = rreq->debug_id;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
+ }
+}
+
+/*
+ * Allocate and partially initialise an I/O request structure.
+ */
+struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+ netfs_stat(&netfs_n_rh_sreq);
+ }
+
+ return subreq;
+}
+
+void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
+ enum netfs_sreq_ref_trace what)
+{
+ int r;
+
+ __refcount_inc(&subreq->ref, &r);
+ trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, r + 1,
+ what);
+}
+
+static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
+ bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ kfree(subreq);
+ netfs_stat_d(&netfs_n_rh_sreq);
+ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
+}
+
+void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
+ enum netfs_sreq_ref_trace what)
+{
+ unsigned int debug_index = subreq->debug_index;
+ unsigned int debug_id = subreq->rreq->debug_id;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&subreq->ref, &r);
+ trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what);
+ if (dead)
+ netfs_free_subrequest(subreq, was_async);
+}
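For orientation, a minimal caller sketch (not in the patch) showing how the allocation helper above pairs with netfs_begin_read() from io.c in the synchronous case. Error handling is abridged and the NETFS_READPAGE origin value is assumed from the wider series rather than shown in this hunk.

static int example_sync_read(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio_file_mapping(folio);
	struct netfs_io_request *rreq;

	rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio),
				   folio_size(folio), NETFS_READPAGE);
	if (IS_ERR(rreq))
		return PTR_ERR(rreq);

	/* In the sync path netfs_begin_read() takes and drops its own hold
	 * ref and the allocation ref is released when the request completes,
	 * so the caller must not touch rreq after this returns.
	 */
	return netfs_begin_read(rreq, true);
}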
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
deleted file mode 100644
index 501da990c259..000000000000
--- a/fs/netfs/read_helper.c
+++ /dev/null
@@ -1,1205 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Network filesystem high-level read support.
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uio.h>
-#include <linux/sched/mm.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/netfs.h>
-#include "internal.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/netfs.h>
-
-MODULE_DESCRIPTION("Network fs support");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned netfs_debug;
-module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
-
-static void netfs_rreq_work(struct work_struct *);
-static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
-
-static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
- bool was_async)
-{
- if (refcount_dec_and_test(&subreq->usage))
- __netfs_put_subrequest(subreq, was_async);
-}
-
-static struct netfs_read_request *netfs_alloc_read_request(
- const struct netfs_read_request_ops *ops, void *netfs_priv,
- struct file *file)
-{
- static atomic_t debug_ids;
- struct netfs_read_request *rreq;
-
- rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
- if (rreq) {
- rreq->netfs_ops = ops;
- rreq->netfs_priv = netfs_priv;
- rreq->inode = file_inode(file);
- rreq->i_size = i_size_read(rreq->inode);
- rreq->debug_id = atomic_inc_return(&debug_ids);
- INIT_LIST_HEAD(&rreq->subrequests);
- INIT_WORK(&rreq->work, netfs_rreq_work);
- refcount_set(&rreq->usage, 1);
- __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (ops->init_rreq)
- ops->init_rreq(rreq, file);
- netfs_stat(&netfs_n_rh_rreq);
- }
-
- return rreq;
-}
-
-static void netfs_get_read_request(struct netfs_read_request *rreq)
-{
- refcount_inc(&rreq->usage);
-}
-
-static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq;
-
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_read_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, was_async);
- }
-}
-
-static void netfs_free_read_request(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
- netfs_rreq_clear_subreqs(rreq, false);
- if (rreq->netfs_priv)
- rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
- trace_netfs_rreq(rreq, netfs_rreq_trace_free);
- if (rreq->cache_resources.ops)
- rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
- netfs_stat_d(&netfs_n_rh_rreq);
-}
-
-static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
-{
- if (refcount_dec_and_test(&rreq->usage)) {
- if (was_async) {
- rreq->work.func = netfs_free_read_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_read_request(&rreq->work);
- }
- }
-}
-
-/*
- * Allocate and partially initialise an I/O request structure.
- */
-static struct netfs_read_subrequest *netfs_alloc_subrequest(
- struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
- if (subreq) {
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->usage, 2);
- subreq->rreq = rreq;
- netfs_get_read_request(rreq);
- netfs_stat(&netfs_n_rh_sreq);
- }
-
- return subreq;
-}
-
-static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
-{
- refcount_inc(&subreq->usage);
-}
-
-static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
- bool was_async)
-{
- struct netfs_read_request *rreq = subreq->rreq;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_free);
- kfree(subreq);
- netfs_stat_d(&netfs_n_rh_sreq);
- netfs_put_read_request(rreq, was_async);
-}
-
-/*
- * Clear the unread part of an I/O request.
- */
-static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
-{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
-}
-
-static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq = priv;
-
- netfs_subreq_terminated(subreq, transferred_or_error, was_async);
-}
-
-/*
- * Issue a read against the cache.
- * - Eats the caller's ref on subreq.
- */
-static void netfs_read_from_cache(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq,
- enum netfs_read_from_hole read_hole)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
-
- netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
- netfs_cache_read_terminated, subreq);
-}
-
-/*
- * Fill a subrequest region with zeroes.
- */
-static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_zero);
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, 0, false);
-}
-
-/*
- * Ask the netfs to issue a read request to the server for us.
- *
- * The netfs is expected to read from subreq->pos + subreq->transferred to
- * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
- * buffer prior to the transferred point as it might clobber dirty data
- * obtained from the cache.
- *
- * Alternatively, the netfs is allowed to indicate one of two things:
- *
- * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
- * make progress.
- *
- * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
- * cleared.
- */
-static void netfs_read_from_server(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_download);
- rreq->netfs_ops->issue_op(subreq);
-}
-
-/*
- * Release those waiting.
- */
-static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_rreq_clear_subreqs(rreq, was_async);
- netfs_put_read_request(rreq, was_async);
-}
-
-/*
- * Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
- *
- * May be called in softirq mode and we inherit a ref from the caller.
- */
-static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq;
- struct folio *folio;
- pgoff_t unlocked = 0;
- bool have_unlocked = false;
-
- rcu_read_lock();
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
-
- xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
- /* We might have multiple writes from the same huge
- * folio, but we mustn't unlock a folio more than once.
- */
- if (have_unlocked && folio_index(folio) <= unlocked)
- continue;
- unlocked = folio_index(folio);
- folio_end_fscache(folio);
- have_unlocked = true;
- }
- }
-
- rcu_read_unlock();
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_subrequest *subreq = priv;
- struct netfs_read_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- netfs_stat(&netfs_n_rh_write_failed);
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_copy_to_cache);
- } else {
- netfs_stat(&netfs_n_rh_write_done);
- }
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
-
- /* If we decrement nr_wr_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_wr_ops))
- netfs_rreq_unmark_after_write(rreq, was_async);
-
- netfs_put_subrequest(subreq, was_async);
-}
-
-/*
- * Perform any outstanding writes to the cache. We inherit a ref from the
- * caller.
- */
-static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct netfs_read_subrequest *subreq, *next, *p;
- struct iov_iter iter;
- int ret;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_write);
-
- /* We don't want terminating writes trying to wake us up whilst we're
- * still going through the list.
- */
- atomic_inc(&rreq->nr_wr_ops);
-
- list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
- list_del_init(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false);
- }
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- /* Amalgamate adjacent writes */
- while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- next = list_next_entry(subreq, rreq_link);
- if (next->start != subreq->start + subreq->len)
- break;
- subreq->len += next->len;
- list_del_init(&next->rreq_link);
- netfs_put_subrequest(next, false);
- }
-
- ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
- if (ret < 0) {
- trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
- continue;
- }
-
- iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
- subreq->start, subreq->len);
-
- atomic_inc(&rreq->nr_wr_ops);
- netfs_stat(&netfs_n_rh_write);
- netfs_get_read_subrequest(subreq);
- trace_netfs_sreq(subreq, netfs_sreq_trace_write);
- cres->ops->write(cres, subreq->start, &iter,
- netfs_rreq_copy_terminated, subreq);
- }
-
- /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_wr_ops))
- netfs_rreq_unmark_after_write(rreq, false);
-}
-
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
-
- netfs_rreq_do_write_to_cache(rreq);
-}
-
-static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq)
-{
- rreq->work.func = netfs_rreq_write_to_cache_work;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
-}
-
-/*
- * Unlock the folios in a read operation. We need to set PG_fscache on any
- * folios we're going to write back before we unlock them.
- */
-static void netfs_rreq_unlock(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
- struct folio *folio;
- unsigned int iopos, account = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
-
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
- __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
- }
- }
-
- /* Walk through the pagecache and the I/O request lists simultaneously.
- * We may have a mixture of cached and uncached sections and we only
- * really want to write out the uncached sections. This is slightly
- * complicated by the possibility that we might have huge pages with a
- * mixture inside.
- */
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_read_subrequest, rreq_link);
- iopos = 0;
- subreq_failed = (subreq->error < 0);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
- unsigned int pgend = pgpos + folio_size(folio);
- bool pg_failed = false;
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
- if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
- folio_start_fscache(folio);
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- account += subreq->transferred;
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed) {
- flush_dcache_folio(folio);
- folio_mark_uptodate(folio);
- }
-
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
- _debug("no unlock");
- else
- folio_unlock(folio);
- }
- }
- rcu_read_unlock();
-
- task_io_account_read(account);
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
-}
-
-/*
- * Handle a short read.
- */
-static void netfs_rreq_short_read(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
- __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
-
- netfs_stat(&netfs_n_rh_short_read);
- trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
-
- netfs_get_read_subrequest(subreq);
- atomic_inc(&rreq->nr_rd_ops);
- if (subreq->source == NETFS_READ_FROM_CACHE)
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
- else
- netfs_read_from_server(rreq, subreq);
-}
-
-/*
- * Resubmit any short or failed operations. Returns true if we got the rreq
- * ref back.
- */
-static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- WARN_ON(in_interrupt());
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
-
- /* We don't want terminating submissions trying to wake us up whilst
- * we're still going through the list.
- */
- atomic_inc(&rreq->nr_rd_ops);
-
- __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error) {
- if (subreq->source != NETFS_READ_FROM_CACHE)
- break;
- subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
- subreq->error = 0;
- netfs_stat(&netfs_n_rh_download_instead);
- trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
- netfs_get_read_subrequest(subreq);
- atomic_inc(&rreq->nr_rd_ops);
- netfs_read_from_server(rreq, subreq);
- } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
- netfs_rreq_short_read(rreq, subreq);
- }
- }
-
- /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_rd_ops))
- return true;
-
- wake_up_var(&rreq->nr_rd_ops);
- return false;
-}
-
-/*
- * Check to see if the data read is still valid.
- */
-static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
-{
- struct netfs_read_subrequest *subreq;
-
- if (!rreq->netfs_ops->is_still_valid ||
- rreq->netfs_ops->is_still_valid(rreq))
- return;
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- subreq->error = -ESTALE;
- __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- }
- }
-}
-
-/*
- * Assess the state of a read request and decide what to do next.
- *
- * Note that we could be in an ordinary kernel thread, on a workqueue or in
- * softirq context at this point. We inherit a ref from the caller.
- */
-static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
-
-again:
- netfs_rreq_is_still_valid(rreq);
-
- if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
- test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
- if (netfs_rreq_perform_resubmissions(rreq))
- goto again;
- return;
- }
-
- netfs_rreq_unlock(rreq);
-
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
- return netfs_rreq_write_to_cache(rreq);
-
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_work(struct work_struct *work)
-{
- struct netfs_read_request *rreq =
- container_of(work, struct netfs_read_request, work);
- netfs_rreq_assess(rreq, false);
-}
-
-/*
- * Handle the completion of all outstanding I/O operations on a read request.
- * We inherit a ref from the caller.
- */
-static void netfs_rreq_terminated(struct netfs_read_request *rreq,
- bool was_async)
-{
- if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
- was_async) {
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_rreq_assess(rreq, was_async);
- }
-}
-
-/**
- * netfs_subreq_terminated - Note the termination of an I/O operation.
- * @subreq: The I/O request that has terminated.
- * @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
- *
- * This tells the read helper that a contributory I/O operation has terminated,
- * one way or another, and that it should integrate the results.
- *
- * The caller indicates in @transferred_or_error the outcome of the operation,
- * supplying a positive value to indicate the number of bytes transferred, 0 to
- * indicate a failure to transfer anything that should be retried or a negative
- * error code. The helper will look after reissuing I/O operations as
- * appropriate and writing downloaded data to the cache.
- *
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- */
-void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
- ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_read_request *rreq = subreq->rreq;
- int u;
-
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_READ_FROM_CACHE:
- netfs_stat(&netfs_n_rh_read_done);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_stat(&netfs_n_rh_download_done);
- break;
- default:
- break;
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_read);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq overread: R%x[%x] %zd > %zu - %zu",
- rreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
-complete:
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
- set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
-
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
- u = atomic_dec_return(&rreq->nr_rd_ops);
- if (u == 0)
- netfs_rreq_terminated(rreq, was_async);
- else if (u == 1)
- wake_up_var(&rreq->nr_rd_ops);
-
- netfs_put_subrequest(subreq, was_async);
- return;
-
-incomplete:
- if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
- netfs_clear_unread(subreq);
- subreq->transferred = subreq->len;
- goto complete;
- }
-
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- goto out;
-
-failed:
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- netfs_stat(&netfs_n_rh_read_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- } else {
- netfs_stat(&netfs_n_rh_download_failed);
- set_bit(NETFS_RREQ_FAILED, &rreq->flags);
- rreq->error = subreq->error;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_subreq_terminated);
-
-static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
- loff_t i_size)
-{
- struct netfs_read_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops)
- return cres->ops->prepare_read(subreq, i_size);
- if (subreq->start >= rreq->i_size)
- return NETFS_FILL_WITH_ZEROES;
- return NETFS_DOWNLOAD_FROM_SERVER;
-}
-
-/*
- * Work out what sort of subrequest the next one will be.
- */
-static enum netfs_read_source
-netfs_rreq_prepare_read(struct netfs_read_request *rreq,
- struct netfs_read_subrequest *subreq)
-{
- enum netfs_read_source source;
-
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
-
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
-
- if (source == NETFS_DOWNLOAD_FROM_SERVER) {
- /* Call out to the netfs to let it shrink the request to fit
- * its own I/O sizes and boundaries. If it shinks it here, it
- * will be called again to make simultaneous calls; if it wants
- * to make serial calls, it can indicate a short read and then
- * we will call it again.
- */
- if (subreq->len > rreq->i_size - subreq->start)
- subreq->len = rreq->i_size - subreq->start;
-
- if (rreq->netfs_ops->clamp_length &&
- !rreq->netfs_ops->clamp_length(subreq)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
- }
-
- if (WARN_ON(subreq->len == 0))
- source = NETFS_INVALID_READ;
-
-out:
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- return source;
-}
-
-/*
- * Slice off a piece of a read request and submit an I/O request for it.
- */
-static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
- unsigned int *_debug_index)
-{
- struct netfs_read_subrequest *subreq;
- enum netfs_read_source source;
-
- subreq = netfs_alloc_subrequest(rreq);
- if (!subreq)
- return false;
-
- subreq->debug_index = (*_debug_index)++;
- subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
-
- _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- /* Call out to the cache to find out what it can do with the remaining
- * subset. It tells us in subreq->flags what it decided should be done
- * and adjusts subreq->len down if the subset crosses a cache boundary.
- *
- * Then when we hand the subset, it can choose to take a subset of that
- * (the starts must coincide), in which case, we go around the loop
- * again and ask it to download the next piece.
- */
- source = netfs_rreq_prepare_read(rreq, subreq);
- if (source == NETFS_INVALID_READ)
- goto subreq_failed;
-
- atomic_inc(&rreq->nr_rd_ops);
-
- rreq->submitted += subreq->len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- switch (source) {
- case NETFS_FILL_WITH_ZEROES:
- netfs_fill_with_zeroes(rreq, subreq);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_read_from_server(rreq, subreq);
- break;
- case NETFS_READ_FROM_CACHE:
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
- break;
- default:
- BUG();
- }
-
- return true;
-
-subreq_failed:
- rreq->error = subreq->error;
- netfs_put_subrequest(subreq, false);
- return false;
-}
-
-static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
- loff_t *_start, size_t *_len, loff_t i_size)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops && cres->ops->expand_readahead)
- cres->ops->expand_readahead(cres, _start, _len, i_size);
-}
-
-static void netfs_rreq_expand(struct netfs_read_request *rreq,
- struct readahead_control *ractl)
-{
- /* Give the cache a chance to change the request parameters. The
- * resultant request must contain the original region.
- */
- netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
-
- /* Give the netfs a chance to change the request parameters. The
- * resultant request must contain the original region.
- */
- if (rreq->netfs_ops->expand_readahead)
- rreq->netfs_ops->expand_readahead(rreq);
-
- /* Expand the request if the cache wants it to start earlier. Note
- * that the expansion may get further extended if the VM wishes to
- * insert THPs and the preferred start and/or end wind up in the middle
- * of THPs.
- *
- * If this is the case, however, the THP size should be an integer
- * multiple of the cache granule size, so we get a whole number of
- * granules to deal with.
- */
- if (rreq->start != readahead_pos(ractl) ||
- rreq->len != readahead_length(ractl)) {
- readahead_expand(ractl, rreq->start, rreq->len);
- rreq->start = readahead_pos(ractl);
- rreq->len = readahead_length(ractl);
-
- trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
- netfs_read_trace_expanded);
- }
-}
-
-/**
- * netfs_readahead - Helper to manage a read request
- * @ractl: The description of the readahead request
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Fulfil a readahead request by drawing data from the cache if possible, or
- * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
- * requests from different sources will get munged together. If necessary, the
- * readahead window can be expanded in either direction to a more convenient
- * alighment for RPC efficiency or to make storage in the cache feasible.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory. It may also be passed a private token, which will
- * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
- *
- * This is usable whether or not caching is enabled.
- */
-void netfs_readahead(struct readahead_control *ractl,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- unsigned int debug_index = 0;
- int ret;
-
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
-
- if (readahead_count(ractl) == 0)
- goto cleanup;
-
- rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
- if (!rreq)
- goto cleanup;
- rreq->mapping = ractl->mapping;
- rreq->start = readahead_pos(ractl);
- rreq->len = readahead_length(ractl);
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
-
- netfs_stat(&netfs_n_rh_readahead);
- trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
- netfs_read_trace_readahead);
-
- netfs_rreq_expand(rreq, ractl);
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Drop the refs on the folios here rather than in the cache or
- * filesystem. The locks will be dropped in netfs_rreq_unlock().
- */
- while (readahead_folio(ractl))
- ;
-
- /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_rd_ops))
- netfs_rreq_assess(rreq, false);
- return;
-
-cleanup_free:
- netfs_put_read_request(rreq, false);
- return;
-cleanup:
- if (netfs_priv)
- ops->cleanup(ractl->mapping, netfs_priv);
- return;
-}
-EXPORT_SYMBOL(netfs_readahead);
-
-/**
- * netfs_readpage - Helper to manage a readpage request
- * @file: The file to read from
- * @folio: The folio to read
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Fulfil a readpage request by drawing data from the cache if possible, or the
- * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
- * from different sources will get munged together.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory. It may also be passed a private token, which will
- * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_readpage(struct file *file,
- struct folio *folio,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- unsigned int debug_index = 0;
- int ret;
-
- _enter("%lx", folio_index(folio));
-
- rreq = netfs_alloc_read_request(ops, netfs_priv, file);
- if (!rreq) {
- if (netfs_priv)
- ops->cleanup(folio_file_mapping(folio), netfs_priv);
- folio_unlock(folio);
- return -ENOMEM;
- }
- rreq->mapping = folio_file_mapping(folio);
- rreq->start = folio_file_pos(folio);
- rreq->len = folio_size(folio);
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
- folio_unlock(folio);
- goto out;
- }
- }
-
- netfs_stat(&netfs_n_rh_readpage);
- trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
-
- netfs_get_read_request(rreq);
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
- * the service code isn't punted off to a random thread pool to
- * process.
- */
- do {
- wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
- netfs_rreq_assess(rreq, false);
- } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage);
- ret = -EIO;
- }
-out:
- netfs_put_read_request(rreq, false);
- return ret;
-}
-EXPORT_SYMBOL(netfs_readpage);
-
-/*
- * Prepare a folio for writing without reading first
- * @folio: The folio being prepared
- * @pos: starting position for the write
- * @len: length of write
- *
- * In some cases, write_begin doesn't need to read at all:
- * - full folio write
- * - write that lies in a folio that is completely beyond EOF
- * - write that covers the folio from start to EOF or beyond it
- *
- * If any of these criteria are met, then zero out the unwritten parts
- * of the folio and return true. Otherwise, return false.
- */
-static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len)
-{
- struct inode *inode = folio_inode(folio);
- loff_t i_size = i_size_read(inode);
- size_t offset = offset_in_folio(folio, pos);
-
- /* Full folio write */
- if (offset == 0 && len >= folio_size(folio))
- return true;
-
- /* pos beyond last folio in the file */
- if (pos - offset >= i_size)
- goto zero_out;
-
- /* Write that covers from the start of the folio to EOF or beyond */
- if (offset == 0 && (pos + len) >= i_size)
- goto zero_out;
-
- return false;
-zero_out:
- zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio));
- return true;
-}
-
-/**
- * netfs_write_begin - Helper to prepare for writing
- * @file: The file to read from
- * @mapping: The mapping to read from
- * @pos: File position at which the write will begin
- * @len: The length of the write (may extend beyond the end of the folio chosen)
- * @aop_flags: AOP_* flags
- * @_folio: Where to put the resultant folio
- * @_fsdata: Place for the netfs to store a cookie
- * @ops: The network filesystem's operations for the helper to use
- * @netfs_priv: Private netfs data to be retained in the request
- *
- * Pre-read data for a write-begin request by drawing data from the cache if
- * possible, or the netfs if not. Space beyond the EOF is zero-filled.
- * Multiple I/O requests from different sources will get munged together. If
- * necessary, the readahead window can be expanded in either direction to a
- * more convenient alighment for RPC efficiency or to make storage in the cache
- * feasible.
- *
- * The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory.
- *
- * The check_write_begin() operation can be provided to check for and flush
- * conflicting writes once the folio is grabbed and locked. It is passed a
- * pointer to the fsdata cookie that gets returned to the VM to be passed to
- * write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
- *
- * This is usable whether or not caching is enabled.
- */
-int netfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int aop_flags,
- struct folio **_folio, void **_fsdata,
- const struct netfs_read_request_ops *ops,
- void *netfs_priv)
-{
- struct netfs_read_request *rreq;
- struct folio *folio;
- struct inode *inode = file_inode(file);
- unsigned int debug_index = 0, fgp_flags;
- pgoff_t index = pos >> PAGE_SHIFT;
- int ret;
-
- DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
-
-retry:
- fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
- folio = __filemap_get_folio(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
-
- if (ops->check_write_begin) {
- /* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ops->check_write_begin(file, pos, len, folio, _fsdata);
- if (ret < 0) {
- trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
- if (ret == -EAGAIN)
- goto retry;
- goto error;
- }
- }
-
- if (folio_test_uptodate(folio))
- goto have_folio;
-
- /* If the page is beyond the EOF, we want to clear it - unless it's
- * within the cache granule containing the EOF, in which case we need
- * to preload the granule.
- */
- if (!ops->is_cache_enabled(inode) &&
- netfs_skip_folio_read(folio, pos, len)) {
- netfs_stat(&netfs_n_rh_write_zskip);
- goto have_folio_no_wait;
- }
-
- ret = -ENOMEM;
- rreq = netfs_alloc_read_request(ops, netfs_priv, file);
- if (!rreq)
- goto error;
- rreq->mapping = folio_file_mapping(folio);
- rreq->start = folio_file_pos(folio);
- rreq->len = folio_size(folio);
- rreq->no_unlock_folio = folio_index(folio);
- __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- netfs_priv = NULL;
-
- if (ops->begin_cache_operation) {
- ret = ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
-
- netfs_stat(&netfs_n_rh_write_begin);
- trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
-
- /* Expand the request to meet caching requirements and download
- * preferences.
- */
- ractl._nr_pages = folio_nr_pages(folio);
- netfs_rreq_expand(rreq, &ractl);
- netfs_get_read_request(rreq);
-
- /* We hold the folio locks, so we can drop the references */
- folio_get(folio);
- while (readahead_folio(&ractl))
- ;
-
- atomic_set(&rreq->nr_rd_ops, 1);
- do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
- * the service code isn't punted off to a random thread pool to
- * process.
- */
- for (;;) {
- wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
- netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin);
- ret = -EIO;
- }
- netfs_put_read_request(rreq, false);
- if (ret < 0)
- goto error;
-
-have_folio:
- ret = folio_wait_fscache_killable(folio);
- if (ret < 0)
- goto error;
-have_folio_no_wait:
- if (netfs_priv)
- ops->cleanup(mapping, netfs_priv);
- *_folio = folio;
- _leave(" = 0");
- return 0;
-
-error_put:
- netfs_put_read_request(rreq, false);
-error:
- folio_unlock(folio);
- folio_put(folio);
- if (netfs_priv)
- ops->cleanup(mapping, netfs_priv);
- _leave(" = %d", ret);
- return ret;
-}
-EXPORT_SYMBOL(netfs_write_begin);
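For context, the check_write_begin() contract described in the removed kernel-doc — return 0 to proceed, unlock the folio and return -EAGAIN to have it re-grabbed, or return an error — can be sketched with a minimal, hypothetical callback. The myfs_* names below are illustrative only and not part of any real filesystem:

    static int myfs_check_write_begin(struct file *file, loff_t pos,
                                      unsigned int len, struct folio *folio,
                                      void **_fsdata)
    {
            struct inode *inode = file_inode(file);

            /* Hypothetical helper: detect a conflicting write in flight. */
            if (myfs_has_conflicting_write(inode, pos, len)) {
                    /* Unlock and ask netfs_write_begin() to re-grab the folio. */
                    folio_unlock(folio);
                    return -EAGAIN;
            }
            return 0;       /* let the write proceed */
    }

In the ops table handed to netfs_write_begin(), such a callback would sit alongside the mandatory issue_op handler.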
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 9ae538c85378..5510a7a14a40 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/seq_file.h>
-#include <linux/netfs.h>
#include "internal.h"
atomic_t netfs_n_rh_readahead;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index fe860c538747..79a8b451791f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(unsigned int npg,
- struct block_device *bdev, sector_t disk_sector,
- bio_end_io_t end_io, struct parallel_io *par)
-{
- struct bio *bio;
-
- npg = bio_max_segs(npg);
- bio = bio_alloc(GFP_NOIO, npg);
- if (bio) {
- bio->bi_iter.bi_sector = disk_sector;
- bio_set_dev(bio, bdev);
- bio->bi_end_io = end_io;
- bio->bi_private = par;
- }
- return bio;
-}
-
static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
{
return offset >= map->start && offset < map->start + map->len;
@@ -171,11 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, map->bdev,
- disk_addr >> SECTOR_SHIFT, end_io, par);
- if (!bio)
- return ERR_PTR(-ENOMEM);
- bio_set_op_attrs(bio, rw, 0);
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
+ bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
}
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(bio);
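The replacement above relies on the consolidated bio_alloc() signature, bio_alloc(bdev, nr_vecs, opf, gfp): the device and operation are set at allocation time, and with a GFP mask that may sleep (GFP_NOIO here) the mempool-backed allocation is guaranteed to succeed, which is why the old -ENOMEM branch and the bio_set_op_attrs() call both disappear. In sketch form, using the names from the hunk above:

    /* opf and bdev are now arguments; only per-bio fields remain to fill in. */
    bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
    bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
    bio->bi_end_io = end_io;
    bio->bi_private = par;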
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index ef9db135c649..6c977288cc28 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -27,7 +27,6 @@
*/
#include <linux/module.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "blocklayout.h"
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 054cc1255fac..456af7d230cf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,7 +17,6 @@
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/freezer.h>
-#include <linux/kthread.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
int ret;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret <= 0)
goto out_err;
nn->nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
@@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp)
finish_wait(&serv->sv_cb_waitq, &wq);
}
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -189,7 +188,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -232,33 +231,10 @@ err_bind:
return ret;
}
-static const struct svc_serv_ops nfs40_cb_sv_ops = {
- .svo_function = nfs4_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-#if defined(CONFIG_NFS_V4_1)
-static const struct svc_serv_ops nfs41_cb_sv_ops = {
- .svo_function = nfs41_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = &nfs41_cb_sv_ops,
-};
-#else
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = NULL,
-};
-#endif
-
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- const struct svc_serv_ops *sv_ops;
+ int (*threadfn)(void *data);
struct svc_serv *serv;
/*
@@ -267,17 +243,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
if (cb_info->serv)
return svc_get(cb_info->serv);
- switch (minorversion) {
- case 0:
- sv_ops = nfs4_cb_sv_ops[0];
- break;
- default:
- sv_ops = nfs4_cb_sv_ops[1];
- }
-
- if (sv_ops == NULL)
- return ERR_PTR(-ENOTSUPP);
-
/*
* Sanity check: if there's no task,
* we should be the first user ...
@@ -286,7 +251,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ threadfn = nfs4_callback_svc;
+#if defined(CONFIG_NFS_V4_1)
+ if (minorversion)
+ threadfn = nfs41_callback_svc;
+#else
+ if (minorversion)
+ return ERR_PTR(-ENOTSUPP);
+#endif
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ threadfn);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
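With the svc_serv_ops tables gone, svc_create() now takes the callback thread function directly, and the threads no longer drop a module reference on exit (hence the removed module_put_and_kthread_exit() calls — thread and module lifetime is presumably handled inside the sunrpc core now). A minimal, hypothetical registration under the new API might look like:

    /* Hypothetical skeleton -- my_program and MY_BUFSIZE are placeholders,
     * not real kernel symbols. */
    static int my_callback_svc(void *vrqstp)
    {
            struct svc_rqst *rqstp = vrqstp;

            set_freezable();
            while (!kthread_freezable_should_stop(NULL)) {
                    if (svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT) <= 0)
                            continue;
                    svc_process(rqstp);
            }
            svc_exit_thread(rqstp);
            return 0;
    }

    /* ...and registration becomes a single call, no ops table required: */
    serv = svc_create(&my_program, MY_BUFSIZE, my_callback_svc);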
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c343666d9a42..c8520284dda7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
struct cb_process_state *cps)
{
struct cb_devicenotifyargs *args = argp;
+ const struct pnfs_layoutdriver_type *ld = NULL;
uint32_t i;
__be32 res = 0;
- struct nfs_client *clp = cps->clp;
- struct nfs_server *server = NULL;
- if (!clp) {
+ if (!cps->clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
}
@@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
for (i = 0; i < args->ndevs; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- if (!server ||
- server->pnfs_curr_ld->id != dev->cbd_layout_type) {
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
- if (server->pnfs_curr_ld &&
- server->pnfs_curr_ld->id == dev->cbd_layout_type) {
- rcu_read_unlock();
- goto found;
- }
- rcu_read_unlock();
- continue;
+ if (!ld || ld->id != dev->cbd_layout_type) {
+ pnfs_put_layoutdriver(ld);
+ ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+ if (!ld)
+ continue;
}
-
- found:
- nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
}
-
+ pnfs_put_layoutdriver(ld);
out:
kfree(args->devs);
return res;
@@ -710,7 +701,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
struct nfs4_copy_state *copy, *tmp_copy;
bool found = false;
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
return htonl(NFS4ERR_SERVERFAULT);
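The device-notify handler now resolves the layout driver by type id instead of scanning the client's superblock list; the last driver found is cached across loop iterations and only re-looked-up when the type changes. Note that pnfs_put_layoutdriver() has to tolerate a NULL argument for this to work, since it is called before the first successful lookup. The pattern, reduced to its essentials:

    const struct pnfs_layoutdriver_type *ld = NULL;
    uint32_t i;

    for (i = 0; i < args->ndevs; i++) {
            u32 type = args->devs[i].cbd_layout_type;

            if (!ld || ld->id != type) {
                    pnfs_put_layoutdriver(ld);      /* NULL-safe drop of the previous driver */
                    ld = pnfs_find_layoutdriver(type);
                    if (!ld)
                            continue;               /* driver not loaded: skip this entry */
            }
            /* ... act on the device id using ld ... */
    }
    pnfs_put_layoutdriver(ld);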
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f90de8043b0f..8dcb08e1a885 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -271,10 +271,6 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
n = ntohl(*p++);
if (n == 0)
goto out;
- if (n > ULONG_MAX / sizeof(*args->devs)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
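The removed NFS4ERR_BADXDR branch was a manual guard against n * sizeof(*args->devs) overflowing. It is redundant because kmalloc_array() performs that multiplication with overflow checking and returns NULL if it would wrap, so the existing allocation-failure path below already covers the overflow case.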
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d1f34229e11a..e828504cc396 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -857,7 +857,8 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
}
if (clp->rpc_ops->discover_trunking != NULL &&
- (server->caps & NFS_CAP_FS_LOCATIONS)) {
+ (server->caps & NFS_CAP_FS_LOCATIONS &&
+ (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) {
error = clp->rpc_ops->discover_trunking(server, mntfh);
if (error < 0)
return error;
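Trunking discovery is now additionally gated on a new per-mount flag (NFS_MOUNT_TRUNK_DISCOVERY), set or cleared by the trunkdiscovery/notrunkdiscovery options added in the fs_context.c hunk near the end of this series. A hypothetical opt-in at mount time, with server and export path as placeholders:

    mount -t nfs4 -o vers=4.1,trunkdiscovery server.example.com:/export /mnt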
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7c9eb679dbdb..5c97cad741a7 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -439,7 +439,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
struct nfs_delegation *freeme = NULL;
int status = 0;
- delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+ delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT);
if (delegation == NULL)
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, stateid);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 75cb1cbe4cde..c6b263b5faf1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -39,6 +39,7 @@
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
+#include <linux/hash.h>
#include "delegation.h"
#include "iostat.h"
@@ -69,26 +70,26 @@ const struct address_space_operations nfs_dir_aops = {
.freepage = nfs_readdir_clear_array,
};
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir)
+#define NFS_INIT_DTSIZE PAGE_SIZE
+
+static struct nfs_open_dir_context *
+alloc_nfs_open_dir_context(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
- ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
if (ctx != NULL) {
- ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- ctx->dir_cookie = 0;
- ctx->dup_cookie = 0;
- ctx->page_index = 0;
- ctx->eof = false;
+ ctx->dtsize = NFS_INIT_DTSIZE;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
nfs_set_cache_invalid(dir,
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
- list_add(&ctx->list, &nfsi->open_files);
- clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
+ memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
spin_unlock(&dir->i_lock);
return ctx;
}
@@ -98,9 +99,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
spin_lock(&dir->i_lock);
- list_del(&ctx->list);
+ list_del_rcu(&ctx->list);
spin_unlock(&dir->i_lock);
- kfree(ctx);
+ kfree_rcu(ctx, rcu_head);
}
/*
@@ -142,6 +143,7 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
+ u64 change_attr;
u64 last_cookie;
unsigned int size;
unsigned char page_full : 1,
@@ -155,11 +157,10 @@ struct nfs_readdir_descriptor {
struct page *page;
struct dir_context *ctx;
pgoff_t page_index;
+ pgoff_t page_index_max;
u64 dir_cookie;
u64 last_cookie;
- u64 dup_cookie;
loff_t current_index;
- loff_t prev_index;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
unsigned long dir_verifier;
@@ -167,24 +168,47 @@ struct nfs_readdir_descriptor {
unsigned long gencount;
unsigned long attr_gencount;
unsigned int cache_entry_index;
- signed char duped;
+ unsigned int buffer_fills;
+ unsigned int dtsize;
+ bool clear_cache;
bool plus;
bool eob;
bool eof;
};
-static void nfs_readdir_array_init(struct nfs_cache_array *array)
+static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(desc->file));
+ unsigned int maxsize = server->dtsize;
+
+ if (sz > maxsize)
+ sz = maxsize;
+ if (sz < NFS_MIN_FILE_IO_SIZE)
+ sz = NFS_MIN_FILE_IO_SIZE;
+ desc->dtsize = sz;
+}
+
+static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc)
{
- memset(array, 0, sizeof(struct nfs_cache_array));
+ nfs_set_dtsize(desc, desc->dtsize >> 1);
}
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
+static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
+{
+ nfs_set_dtsize(desc, desc->dtsize << 1);
+}
+
+static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
+ u64 change_attr)
{
struct nfs_cache_array *array;
array = kmap_atomic(page);
- nfs_readdir_array_init(array);
+ array->change_attr = change_attr;
array->last_cookie = last_cookie;
+ array->size = 0;
+ array->page_full = 0;
+ array->page_is_eof = 0;
array->cookies_are_ordered = 1;
kunmap_atomic(array);
}
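Each open directory context now starts with a per-reader READDIR buffer size of one page and adapts it between NFS_MIN_FILE_IO_SIZE and the server's dtsize cap by doubling or halving. A worked example of the clamping behaviour, assuming 4K pages, an NFS_MIN_FILE_IO_SIZE of 1024 and a server dtsize of 64K (the last two values are illustrative):

    desc->dtsize = NFS_INIT_DTSIZE; /* 4096: one page to start with */
    nfs_grow_dtsize(desc);          /* cache refill needed -> 8192 */
    nfs_grow_dtsize(desc);          /* -> 16384 ... doubling continues */
    nfs_grow_dtsize(desc);          /* -> 32768 */
    nfs_grow_dtsize(desc);          /* -> 65536, now pinned at the server cap */
    nfs_grow_dtsize(desc);          /* stays 65536: nfs_set_dtsize() clamps to server->dtsize */
    nfs_shrink_dtsize(desc);        /* reader consumed little -> halved to 32768, never below 1024 */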
@@ -192,25 +216,31 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
-static
-void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct page *page)
{
struct nfs_cache_array *array;
- int i;
+ unsigned int i;
array = kmap_atomic(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].name);
- nfs_readdir_array_init(array);
+ array->size = 0;
kunmap_atomic(array);
}
+static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
+ u64 change_attr)
+{
+ nfs_readdir_clear_array(page);
+ nfs_readdir_page_init_array(page, last_cookie, change_attr);
+}
+
static struct page *
nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
{
struct page *page = alloc_page(gfp_flags);
if (page)
- nfs_readdir_page_init_array(page, last_cookie);
+ nfs_readdir_page_init_array(page, last_cookie, 0);
return page;
}
@@ -222,6 +252,11 @@ static void nfs_readdir_page_array_free(struct page *page)
}
}
+static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
+{
+ return array->size == 0 ? array->last_cookie : array->array[0].cookie;
+}
+
static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
{
array->page_is_eof = 1;
@@ -251,36 +286,40 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
return ret;
}
+static size_t nfs_readdir_array_maxentries(void)
+{
+ return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
+ sizeof(struct nfs_cache_array_entry);
+}
+
/*
* Check that the next array entry lies entirely within the page bounds
*/
static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
{
- struct nfs_cache_array_entry *cache_entry;
-
if (array->page_full)
return -ENOSPC;
- cache_entry = &array->array[array->size + 1];
- if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
+ if (array->size == nfs_readdir_array_maxentries()) {
array->page_full = 1;
return -ENOSPC;
}
return 0;
}
-static
-int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+static int nfs_readdir_page_array_append(struct page *page,
+ const struct nfs_entry *entry,
+ u64 *cookie)
{
struct nfs_cache_array *array;
struct nfs_cache_array_entry *cache_entry;
const char *name;
- int ret;
+ int ret = -ENOMEM;
name = nfs_readdir_copy_name(entry->name, entry->len);
- if (!name)
- return -ENOMEM;
array = kmap_atomic(page);
+ if (!name)
+ goto out;
ret = nfs_readdir_array_can_expand(array);
if (ret) {
kfree(name);
@@ -288,7 +327,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
}
cache_entry = &array->array[array->size];
- cache_entry->cookie = entry->prev_cookie;
+ cache_entry->cookie = array->last_cookie;
cache_entry->ino = entry->ino;
cache_entry->d_type = entry->d_type;
cache_entry->name_len = entry->len;
@@ -300,23 +339,72 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
if (entry->eof != 0)
nfs_readdir_array_set_eof(array);
out:
+ *cookie = array->last_cookie;
+ kunmap_atomic(array);
+ return ret;
+}
+
+#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
+/*
+ * Hash algorithm allowing content addressable access to sequences
+ * of directory cookies. Content is addressed by the value of the
+ * cookie index of the first readdir entry in a page.
+ *
+ * We select only the first 18 bits to avoid issues with excessive
+ * memory use for the page cache XArray. 18 bits should allow the caching
+ * of 262144 pages of sequences of readdir entries. Since each page holds
+ * 127 readdir entries for a typical 64-bit system, that works out to a
+ * cache of ~ 33 million entries per directory.
+ */
+static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+{
+ if (cookie == 0)
+ return 0;
+ return hash_64(cookie, 18);
+}
+
+static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
+ u64 change_attr)
+{
+ struct nfs_cache_array *array = kmap_atomic(page);
+ int ret = true;
+
+ if (array->change_attr != change_attr)
+ ret = false;
+ if (nfs_readdir_array_index_cookie(array) != last_cookie)
+ ret = false;
kunmap_atomic(array);
return ret;
}
+static void nfs_readdir_page_unlock_and_put(struct page *page)
+{
+ unlock_page(page);
+ put_page(page);
+}
+
+static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie,
+ u64 change_attr)
+{
+ if (PageUptodate(page)) {
+ if (nfs_readdir_page_validate(page, cookie, change_attr))
+ return;
+ nfs_readdir_clear_array(page);
+ }
+ nfs_readdir_page_init_array(page, cookie, change_attr);
+ SetPageUptodate(page);
+}
+
static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
- pgoff_t index, u64 last_cookie)
+ u64 cookie, u64 change_attr)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
struct page *page;
page = grab_cache_page(mapping, index);
- if (page && !PageUptodate(page)) {
- nfs_readdir_page_init_array(page, last_cookie);
- if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
- nfs_zap_mapping(mapping->host, mapping);
- SetPageUptodate(page);
- }
-
+ if (!page)
+ return NULL;
+ nfs_readdir_page_init_and_validate(page, cookie, change_attr);
return page;
}
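The sizing claim in the hash comment above can be checked directly from the structures introduced in this patch (figures for a typical 64-bit build with 4K pages):

    entries per page:  (PAGE_SIZE - sizeof(struct nfs_cache_array))
                       / sizeof(struct nfs_cache_array_entry)   ~= 127
    pages addressable: 1 << 18                                   = 262144
    cached entries:    262144 * 127                              = 33,292,288

i.e. roughly the "33 million entries per directory" quoted in the comment.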
@@ -351,24 +439,19 @@ static void nfs_readdir_page_set_eof(struct page *page)
kunmap_atomic(array);
}
-static void nfs_readdir_page_unlock_and_put(struct page *page)
-{
- unlock_page(page);
- put_page(page);
-}
-
static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
- pgoff_t index, u64 cookie)
+ u64 cookie, u64 change_attr)
{
+ pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
struct page *page;
- page = nfs_readdir_page_get_locked(mapping, index, cookie);
- if (page) {
- if (nfs_readdir_page_last_cookie(page) == cookie)
- return page;
- nfs_readdir_page_unlock_and_put(page);
- }
- return NULL;
+ page = grab_cache_page_nowait(mapping, index);
+ if (!page)
+ return NULL;
+ nfs_readdir_page_init_and_validate(page, cookie, change_attr);
+ if (nfs_readdir_page_last_cookie(page) != cookie)
+ nfs_readdir_page_reinit_array(page, cookie, change_attr);
+ return page;
}
static inline
@@ -390,6 +473,25 @@ bool nfs_readdir_use_cookie(const struct file *filp)
return true;
}
+static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
+ struct nfs_readdir_descriptor *desc)
+{
+ if (array->page_full) {
+ desc->last_cookie = array->last_cookie;
+ desc->current_index += array->size;
+ desc->cache_entry_index = 0;
+ desc->page_index++;
+ } else
+ desc->last_cookie = nfs_readdir_array_index_cookie(array);
+}
+
+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ desc->page_index = 0;
+}
+
static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
@@ -401,6 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
if (diff >= array->size) {
if (array->page_is_eof)
goto out_eof;
+ nfs_readdir_seek_next_array(array, desc);
return -EAGAIN;
}
@@ -413,16 +516,6 @@ out_eof:
return -EBADCOOKIE;
}
-static bool
-nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
-{
- if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
- NFS_INO_INVALID_DATA))
- return false;
- smp_rmb();
- return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
-}
-
static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
u64 cookie)
{
@@ -439,8 +532,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
- int i;
- loff_t new_pos;
+ unsigned int i;
int status = -EAGAIN;
if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
@@ -448,33 +540,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
for (i = 0; i < array->size; i++) {
if (array->array[i].cookie == desc->dir_cookie) {
- struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-
- new_pos = desc->current_index + i;
- if (desc->attr_gencount != nfsi->attr_gencount ||
- !nfs_readdir_inode_mapping_valid(nfsi)) {
- desc->duped = 0;
- desc->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->prev_index) {
- if (desc->duped > 0
- && desc->dup_cookie == desc->dir_cookie) {
- if (printk_ratelimit()) {
- pr_notice("NFS: directory %pD2 contains a readdir loop."
- "Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file, array->array[i].name, desc->dir_cookie);
- }
- status = -ELOOP;
- goto out;
- }
- desc->dup_cookie = desc->dir_cookie;
- desc->duped = -1;
- }
if (nfs_readdir_use_cookie(desc->file))
desc->ctx->pos = desc->dir_cookie;
else
- desc->ctx->pos = new_pos;
- desc->prev_index = new_pos;
+ desc->ctx->pos = desc->current_index + i;
desc->cache_entry_index = i;
return 0;
}
@@ -484,8 +553,8 @@ check_eof:
status = -EBADCOOKIE;
if (desc->dir_cookie == array->last_cookie)
desc->eof = true;
- }
-out:
+ } else
+ nfs_readdir_seek_next_array(array, desc);
return status;
}
@@ -501,11 +570,6 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
- if (status == -EAGAIN) {
- desc->last_cookie = array->last_cookie;
- desc->current_index += array->size;
- desc->page_index++;
- }
kunmap_atomic(array);
return status;
}
@@ -541,7 +605,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
/* We requested READDIRPLUS, but the server doesn't grok it */
if (error == -ENOTSUPP && desc->plus) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
- clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
desc->plus = arg.plus = false;
goto again;
}
@@ -591,51 +654,68 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
return 1;
}
-static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+ unsigned int cache_hits,
+ unsigned int cache_misses)
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
- if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
- return true;
- if (ctx->pos == 0)
+ if (ctx->pos == 0 ||
+ cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
return false;
}
/*
- * This function is called by the lookup and getattr code to request the
+ * This function is called by the getattr code to request the
* use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-void nfs_advise_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files))
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_hits);
+ rcu_read_unlock();
+ }
}
/*
* This function is mainly for use by nfs_getattr().
*
* If this is an 'ls -l', we want to force use of readdirplus.
- * Do this by checking if there is an active file descriptor
- * and calling nfs_advise_use_readdirplus, then forcing a
- * cache flush.
*/
-void nfs_force_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
{
struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
- !list_empty(&nfsi->open_files)) {
- set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_misses);
+ rcu_read_unlock();
}
}
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir,
+ unsigned int flags)
+{
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ return;
+ if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL))
+ return;
+ nfs_readdir_record_entry_cache_miss(dir);
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
unsigned long dir_verifier)
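The per-open-context hit/miss counters replace the old NFS_INO_ADVISE_RDPLUS and NFS_INO_FORCE_READDIR inode flags. Roughly: a child attribute lookup satisfied from cache bumps cache_hits, a lookup or revalidation that had to go to the server bumps cache_misses, and nfs_readdir() drains both counters on entry. Once the combined count exceeds NFS_READDIR_CACHE_USAGE_THRESHOLD (8), the next READDIR round switches to READDIRPLUS, so an 'ls -l'-style workload flips over after only a handful of stat() calls; per the NFS_READDIR_CACHE_MISS_THRESHOLD (16) logic later in this patch, a larger burst of misses also forces the cached pages to be refilled so the plus results can repopulate the dentry cache.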
@@ -686,8 +766,12 @@ again:
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
nfs_setsecurity(d_inode(dentry), entry->fattr);
+ trace_nfs_readdir_lookup_revalidate(d_inode(parent),
+ dentry, 0, status);
goto out;
} else {
+ trace_nfs_readdir_lookup_revalidate_failed(
+ d_inode(parent), dentry, 0);
d_invalidate(dentry);
dput(dentry);
dentry = NULL;
@@ -709,22 +793,38 @@ again:
dentry = alias;
}
nfs_set_verifier(dentry, dir_verifier);
+ trace_nfs_readdir_lookup(d_inode(parent), dentry, 0);
out:
dput(dentry);
}
+static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry,
+ struct xdr_stream *stream)
+{
+ int ret;
+
+ if (entry->fattr->label)
+ entry->fattr->label->len = NFS4_MAXLABELLEN;
+ ret = xdr_decode(desc, entry, stream);
+ if (ret || !desc->plus)
+ return ret;
+ nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier);
+ return 0;
+}
+
/* Perform conversion from xdr to cache array */
static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
struct nfs_entry *entry,
- struct page **xdr_pages,
- unsigned int buflen,
- struct page **arrays,
- size_t narrays)
+ struct page **xdr_pages, unsigned int buflen,
+ struct page **arrays, size_t narrays,
+ u64 change_attr)
{
struct address_space *mapping = desc->file->f_mapping;
struct xdr_stream stream;
struct xdr_buf buf;
struct page *scratch, *new, *page = *arrays;
+ u64 cookie;
int status;
scratch = alloc_page(GFP_KERNEL);
@@ -735,54 +835,50 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
xdr_set_scratch_page(&stream, scratch);
do {
- if (entry->fattr->label)
- entry->fattr->label->len = NFS4_MAXLABELLEN;
-
- status = xdr_decode(desc, entry, &stream);
+ status = nfs_readdir_entry_decode(desc, entry, &stream);
if (status != 0)
break;
- if (desc->plus)
- nfs_prime_dcache(file_dentry(desc->file), entry,
- desc->dir_verifier);
-
- status = nfs_readdir_add_to_array(entry, page);
+ status = nfs_readdir_page_array_append(page, entry, &cookie);
if (status != -ENOSPC)
continue;
if (page->mapping != mapping) {
if (!--narrays)
break;
- new = nfs_readdir_page_array_alloc(entry->prev_cookie,
- GFP_KERNEL);
+ new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL);
if (!new)
break;
arrays++;
*arrays = page = new;
} else {
- new = nfs_readdir_page_get_next(mapping,
- page->index + 1,
- entry->prev_cookie);
+ new = nfs_readdir_page_get_next(mapping, cookie,
+ change_attr);
if (!new)
break;
if (page != *arrays)
nfs_readdir_page_unlock_and_put(page);
page = new;
}
- status = nfs_readdir_add_to_array(entry, page);
+ desc->page_index_max++;
+ status = nfs_readdir_page_array_append(page, entry, &cookie);
} while (!status && !entry->eof);
switch (status) {
case -EBADCOOKIE:
- if (entry->eof) {
- nfs_readdir_page_set_eof(page);
- status = 0;
- }
- break;
- case -ENOSPC:
+ if (!entry->eof)
+ break;
+ nfs_readdir_page_set_eof(page);
+ fallthrough;
case -EAGAIN:
status = 0;
break;
+ case -ENOSPC:
+ status = 0;
+ if (!desc->plus)
+ break;
+ while (!nfs_readdir_entry_decode(desc, entry, &stream))
+ ;
}
if (page != *arrays)
@@ -828,12 +924,14 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
__be32 *verf_arg, __be32 *verf_res,
struct page **arrays, size_t narrays)
{
+ u64 change_attr;
struct page **pages;
struct page *page = *arrays;
struct nfs_entry *entry;
size_t array_size;
struct inode *inode = file_inode(desc->file);
- size_t dtsize = NFS_SERVER(inode)->dtsize;
+ unsigned int dtsize = desc->dtsize;
+ unsigned int pglen;
int status = -ENOMEM;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -851,27 +949,21 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
if (!pages)
goto out;
- do {
- unsigned int pglen;
- status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie,
- pages, dtsize,
- verf_res);
- if (status < 0)
- break;
-
- pglen = status;
- if (pglen == 0) {
- nfs_readdir_page_set_eof(page);
- break;
- }
-
- verf_arg = verf_res;
+ change_attr = inode_peek_iversion_raw(inode);
+ status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
+ dtsize, verf_res);
+ if (status < 0)
+ goto free_pages;
+ pglen = status;
+ if (pglen != 0)
status = nfs_readdir_page_filler(desc, entry, pages, pglen,
- arrays, narrays);
- } while (!status && nfs_readdir_page_needs_filling(page) &&
- page_mapping(page));
+ arrays, narrays, change_attr);
+ else
+ nfs_readdir_page_set_eof(page);
+ desc->buffer_fills++;
+free_pages:
nfs_readdir_free_pages(pages, array_size);
out:
nfs_free_fattr(entry->fattr);
@@ -896,9 +988,17 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
static struct page *
nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
{
- return nfs_readdir_page_get_locked(desc->file->f_mapping,
- desc->page_index,
- desc->last_cookie);
+ struct address_space *mapping = desc->file->f_mapping;
+ u64 change_attr = inode_peek_iversion_raw(mapping->host);
+ u64 cookie = desc->last_cookie;
+ struct page *page;
+
+ page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
+ if (!page)
+ return NULL;
+ if (desc->clear_cache && !nfs_readdir_page_needs_filling(page))
+ nfs_readdir_page_reinit_array(page, cookie, change_attr);
+ return page;
}
/*
@@ -916,13 +1016,23 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
if (!desc->page)
return -ENOMEM;
if (nfs_readdir_page_needs_filling(desc->page)) {
+ /* Grow the dtsize if we had to go back for more pages */
+ if (desc->page_index == desc->page_index_max)
+ nfs_grow_dtsize(desc);
+ desc->page_index_max = desc->page_index;
+ trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
+ desc->last_cookie,
+ desc->page->index, desc->dtsize);
res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
&desc->page, 1);
if (res < 0) {
nfs_readdir_page_unlock_and_put_cached(desc);
+ trace_nfs_readdir_cache_fill_done(inode, res);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
- desc->page_index = 0;
+ nfs_readdir_rewind_search(desc);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 0, MAX_LFS_FILESIZE);
return -EAGAIN;
}
return res;
@@ -930,9 +1040,16 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
/*
* Set the cookie verifier if the page cache was empty
*/
- if (desc->page_index == 0)
+ if (desc->last_cookie == 0 &&
+ memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
memcpy(nfsi->cookieverf, verf,
sizeof(nfsi->cookieverf));
+ invalidate_inode_pages2_range(desc->file->f_mapping, 1,
+ -1);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 1, MAX_LFS_FILESIZE);
+ }
+ desc->clear_cache = false;
}
res = nfs_readdir_search_array(desc);
if (res == 0)
@@ -941,34 +1058,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
return res;
}
-static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
-{
- struct address_space *mapping = desc->file->f_mapping;
- struct inode *dir = file_inode(desc->file);
- unsigned int dtsize = NFS_SERVER(dir)->dtsize;
- loff_t size = i_size_read(dir);
-
- /*
- * Default to uncached readdir if the page cache is empty, and
- * we're looking for a non-zero cookie in a large directory.
- */
- return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
-}
-
/* Search for desc->dir_cookie from the beginning of the page cache */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
int res;
- if (nfs_readdir_dont_search_cache(desc))
- return -EBADCOOKIE;
-
do {
- if (desc->page_index == 0) {
- desc->current_index = 0;
- desc->prev_index = 0;
- desc->last_cookie = 0;
- }
res = find_and_lock_cache_page(desc);
} while (res == -EAGAIN);
return res;
@@ -982,7 +1077,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
{
struct file *file = desc->file;
struct nfs_cache_array *array;
- unsigned int i = 0;
+ unsigned int i;
array = kmap(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -995,16 +1090,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
break;
}
memcpy(desc->verf, verf, sizeof(desc->verf));
- if (i < (array->size-1))
- desc->dir_cookie = array->array[i+1].cookie;
- else
+ if (i == array->size - 1) {
desc->dir_cookie = array->last_cookie;
+ nfs_readdir_seek_next_array(array, desc);
+ } else {
+ desc->dir_cookie = array->array[i + 1].cookie;
+ desc->last_cookie = array->array[0].cookie;
+ }
if (nfs_readdir_use_cookie(file))
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
- if (desc->duped != 0)
- desc->duped = 1;
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1046,9 +1142,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
desc->page_index = 0;
desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->duped = 0;
+ desc->page_index_max = 0;
+
+ trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
+ -1, desc->dtsize);
status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
+ if (status < 0) {
+ trace_nfs_readdir_uncached_done(file_inode(desc->file), status);
+ goto out_free;
+ }
for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
desc->page = arrays[i];
@@ -1056,15 +1159,44 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
}
desc->page = NULL;
-
+ /*
+ * Grow the dtsize if we have to go back for more pages,
+ * or shrink it if we're reading too many.
+ */
+ if (!desc->eof) {
+ if (!desc->eob)
+ nfs_grow_dtsize(desc);
+ else if (desc->buffer_fills == 1 &&
+ i < (desc->page_index_max >> 1))
+ nfs_shrink_dtsize(desc);
+ }
+out_free:
for (i = 0; i < sz && arrays[i]; i++)
nfs_readdir_page_array_free(arrays[i]);
out:
+ if (!nfs_readdir_use_cookie(desc->file))
+ nfs_readdir_rewind_search(desc);
+ desc->page_index_max = -1;
kfree(arrays);
dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
return status;
}
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+static bool nfs_readdir_handle_cache_misses(struct inode *inode,
+ struct nfs_readdir_descriptor *desc,
+ unsigned int cache_misses,
+ bool force_clear)
+{
+ if (desc->ctx->pos == 0 || !desc->plus)
+ return false;
+ if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear)
+ return false;
+ trace_nfs_readdir_force_readdirplus(inode);
+ return true;
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -1076,7 +1208,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
- pgoff_t page_index;
+ unsigned int cache_hits, cache_misses;
+ bool force_clear;
int res;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1089,11 +1222,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
- res = nfs_revalidate_mapping(inode, file->f_mapping);
- if (res < 0)
- goto out;
- }
+ nfs_revalidate_mapping(inode, file->f_mapping);
res = -ENOMEM;
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -1101,16 +1230,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out;
desc->file = file;
desc->ctx = ctx;
- desc->plus = nfs_use_readdirplus(inode, ctx);
+ desc->page_index_max = -1;
spin_lock(&file->f_lock);
desc->dir_cookie = dir_ctx->dir_cookie;
- desc->dup_cookie = dir_ctx->dup_cookie;
- desc->duped = dir_ctx->duped;
- page_index = dir_ctx->page_index;
+ desc->page_index = dir_ctx->page_index;
+ desc->last_cookie = dir_ctx->last_cookie;
desc->attr_gencount = dir_ctx->attr_gencount;
desc->eof = dir_ctx->eof;
+ nfs_set_dtsize(desc, dir_ctx->dtsize);
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+ cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
+ force_clear = dir_ctx->force_clear;
spin_unlock(&file->f_lock);
if (desc->eof) {
@@ -1118,9 +1250,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}
- if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
- list_is_singular(&nfsi->open_files))
- invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+ desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+ force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses,
+ force_clear);
+ desc->clear_cache = force_clear;
do {
res = readdir_search_pagecache(desc);
@@ -1139,9 +1272,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
}
if (res == -ETOOSMALL && desc->plus) {
- clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
nfs_zap_caches(inode);
- desc->page_index = 0;
desc->plus = false;
desc->eof = false;
continue;
@@ -1151,15 +1282,18 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
nfs_do_filldir(desc, nfsi->cookieverf);
nfs_readdir_page_unlock_and_put_cached(desc);
+ if (desc->page_index == desc->page_index_max)
+ desc->clear_cache = force_clear;
} while (!desc->eob && !desc->eof);
spin_lock(&file->f_lock);
dir_ctx->dir_cookie = desc->dir_cookie;
- dir_ctx->dup_cookie = desc->dup_cookie;
- dir_ctx->duped = desc->duped;
+ dir_ctx->last_cookie = desc->last_cookie;
dir_ctx->attr_gencount = desc->attr_gencount;
dir_ctx->page_index = desc->page_index;
+ dir_ctx->force_clear = force_clear;
dir_ctx->eof = desc->eof;
+ dir_ctx->dtsize = desc->dtsize;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
spin_unlock(&file->f_lock);
out_free:
@@ -1197,13 +1331,14 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- if (nfs_readdir_use_cookie(filp))
- dir_ctx->dir_cookie = offset;
- else
+ dir_ctx->page_index = 0;
+ if (!nfs_readdir_use_cookie(filp)) {
dir_ctx->dir_cookie = 0;
- if (offset == 0)
- memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
- dir_ctx->duped = 0;
+ dir_ctx->last_cookie = 0;
+ } else {
+ dir_ctx->dir_cookie = offset;
+ dir_ctx->last_cookie = offset;
+ }
dir_ctx->eof = false;
}
spin_unlock(&filp->f_lock);
@@ -1419,7 +1554,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
if (flags & LOOKUP_REVAL)
goto out_force;
out:
- return (inode->i_nlink == 0) ? -ESTALE : 0;
+ if (inode->i_nlink > 0 ||
+ (inode->i_nlink == 0 &&
+ test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags)))
+ return 0;
+ else
+ return -ESTALE;
out_force:
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -1469,9 +1609,7 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
{
switch (error) {
case 1:
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
- __func__, dentry);
- return 1;
+ break;
case 0:
/*
* We can't d_drop the root of a disconnected tree:
@@ -1480,13 +1618,10 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
* inodes on unmount and further oopses.
*/
if (inode && IS_ROOT(dentry))
- return 1;
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
- __func__, dentry);
- return 0;
+ error = 1;
+ break;
}
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
- __func__, dentry, error);
+ trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error);
return error;
}
@@ -1511,15 +1646,17 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
}
-static int
-nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
- struct inode *inode)
+static int nfs_lookup_revalidate_dentry(struct inode *dir,
+ struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
{
struct nfs_fh *fhandle;
struct nfs_fattr *fattr;
unsigned long dir_verifier;
int ret;
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
ret = -ENOMEM;
fhandle = nfs_alloc_fhandle();
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
@@ -1540,6 +1677,10 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
}
goto out;
}
+
+ /* Request help from readdirplus */
+ nfs_lookup_advise_force_readdirplus(dir, flags);
+
ret = 0;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
goto out;
@@ -1549,8 +1690,6 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
nfs_setsecurity(inode, fattr);
nfs_set_verifier(dentry, dir_verifier);
- /* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
ret = 1;
out:
nfs_free_fattr(fattr);
@@ -1607,7 +1746,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
nfs_mark_dir_for_revalidate(dir);
goto out_bad;
}
- nfs_advise_use_readdirplus(dir);
goto out_valid;
}
@@ -1617,10 +1755,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
if (NFS_STALE(inode))
goto out_bad;
- trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
- error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
- trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
- return error;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
out_valid:
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
out_bad:
@@ -1814,7 +1949,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
goto out;
/* Notify readdir to use READDIRPLUS */
- nfs_force_use_readdirplus(dir);
+ nfs_lookup_advise_force_readdirplus(dir, flags);
no_entry:
res = d_splice_alias(inode, dentry);
@@ -1853,16 +1988,6 @@ const struct dentry_operations nfs4_dentry_operations = {
};
EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
-static fmode_t flags_to_mode(int flags)
-{
- fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
- if ((flags & O_ACCMODE) != O_WRONLY)
- res |= FMODE_READ;
- if ((flags & O_ACCMODE) != O_RDONLY)
- res |= FMODE_WRITE;
- return res;
-}
-
static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
@@ -2077,7 +2202,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode);
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
full_reval:
return nfs_do_lookup_revalidate(dir, dentry, flags);
@@ -2330,7 +2455,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
trace_nfs_unlink_enter(dir, dentry);
spin_lock(&dentry->d_lock);
- if (d_count(dentry) > 1) {
+ if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED,
+ &NFS_I(d_inode(dentry))->flags)) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(d_inode(dentry), 0);
@@ -2989,11 +3115,8 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
/*
* Determine which access bits we want to ask for...
*/
- cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
- if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
- cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
- NFS_ACCESS_XALIST;
- }
+ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND |
+ nfs_access_xattr_mask(NFS_SERVER(inode));
if (S_ISDIR(inode->i_mode))
cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
else
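nfs_access_xattr_mask() is not visible in this hunk; judging from the open-coded logic it replaces, it presumably reduces to something like the sketch below (a reconstruction, not a quote of the actual helper):

    static u32 nfs_access_xattr_mask(const struct nfs_server *server)
    {
            if (server->caps & NFS_CAP_XATTR)
                    return NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
                           NFS_ACCESS_XALIST;
            return 0;
    }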
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index eabfdab543c8..11c566d8769f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -173,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter);
- return nfs_file_direct_write(iocb, iter);
+ return nfs_file_direct_read(iocb, iter, true);
+ return nfs_file_direct_write(iocb, iter, true);
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -425,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
@@ -440,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -482,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (iter_is_iovec(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
- nfs_start_io_direct(inode);
+ if (!swap)
+ nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- nfs_end_io_direct(inode);
+ if (!swap)
+ nfs_end_io_direct(inode);
if (requested > 0) {
result = nfs_direct_wait(dreq);
@@ -790,7 +794,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
*/
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
struct iov_iter *iter,
- loff_t pos)
+ loff_t pos, int ioflags)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -798,7 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
@@ -876,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
@@ -892,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
ssize_t result, requested;
size_t count;
@@ -906,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- result = generic_write_checks(iocb, iter);
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
count = result;
@@ -937,16 +947,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dreq->iocb = iocb;
pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
- nfs_start_io_direct(inode);
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
- requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- }
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
- nfs_end_io_direct(inode);
+ nfs_end_io_direct(inode);
+ }
if (requested > 0) {
result = nfs_direct_wait(dreq);
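The new swap argument lets the swap-over-NFS path reuse the O_DIRECT machinery while skipping the pieces that are unsafe or meaningless under memory reclaim: generic_write_checks() and the nfs_start_io_direct()/nfs_end_io_direct() inode I/O exclusion are bypassed, and swap writes are issued with FLUSH_STABLE rather than FLUSH_COND_STABLE, presumably because swap-out gets no later opportunity to COMMIT unstable writes.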
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 76d76acbc594..150b7fa8f0a7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -44,11 +44,6 @@
static const struct vm_operations_struct nfs_file_vm_ops;
-/* Hack for future NFS swap support */
-#ifndef IS_SWAPFILE
-# define IS_SWAPFILE(inode) (0)
-#endif
-
int nfs_check_flags(int flags)
{
if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
@@ -162,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_read(iocb, to);
+ return nfs_file_direct_read(iocb, to, false);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -406,17 +401,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void nfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
- page, offset, length);
+ dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
+ folio->index, offset, length);
- if (offset != 0 || length < PAGE_SIZE)
+ if (offset != 0 || length < folio_size(folio))
return;
/* Cancel any unstarted writes on this page */
- nfs_wb_page_cancel(page_file_mapping(page)->host, page);
- wait_on_page_fscache(page);
+ nfs_wb_folio_cancel(folio->mapping->host, folio);
+ folio_wait_fscache(folio);
}
/*
@@ -472,15 +467,15 @@ static void nfs_check_dirty_writeback(struct page *page,
* - Caller holds page lock
* - Return 0 if successful, -error otherwise
*/
-static int nfs_launder_page(struct page *page)
+static int nfs_launder_folio(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio->mapping->host;
- dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
- inode->i_ino, (long long)page_offset(page));
+ dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
+ inode->i_ino, folio_pos(folio));
- wait_on_page_fscache(page);
- return nfs_wb_page(inode, page);
+ folio_wait_fscache(folio);
+ return nfs_wb_page(inode, &folio->page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -488,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
- struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
@@ -502,31 +498,39 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
*span = sis->pages;
+
+ if (cl->rpc_ops->enable_swap)
+ cl->rpc_ops->enable_swap(inode);
+
return rpc_clnt_swap_activate(clnt);
}
static void nfs_swap_deactivate(struct file *file)
{
- struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
rpc_clnt_swap_deactivate(clnt);
+ if (cl->rpc_ops->disable_swap)
+ cl->rpc_ops->disable_swap(file_inode(file));
}
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
- .readpages = nfs_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .readahead = nfs_readahead,
+ .dirty_folio = filemap_dirty_folio,
.writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
- .invalidatepage = nfs_invalidate_page,
+ .invalidate_folio = nfs_invalidate_folio,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
- .launder_page = nfs_launder_page,
+ .launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
@@ -619,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
return result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_write(iocb, from);
+ return nfs_file_direct_write(iocb, from, false);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, iov_iter_count(from), (long long) iocb->ki_pos);
@@ -642,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
result = generic_write_checks(iocb, from);
if (result > 0) {
current->backing_dev_info = inode_to_bdi(inode);
- result = generic_perform_write(file, from, iocb->ki_pos);
+ result = generic_perform_write(iocb, from);
current->backing_dev_info = NULL;
}
nfs_end_io_write(inode);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 9c96e3e5ed35..76deddab0a8f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1075,7 +1075,7 @@ filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
- new = pnfs_alloc_commit_array(size, GFP_NOIO);
+ new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
array = pnfs_add_commit_array(fl_cinfo, new, lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index a553d59afa8b..604be402ae13 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -663,7 +663,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(inode, GFP_KERNEL);
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}
static void
@@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(inode, GFP_NOIO);
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}
static void
@@ -806,13 +806,10 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
bool strict_iomode)
{
pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_READ,
- strict_iomode,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes, IOMODE_READ,
+ strict_iomode, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -894,13 +891,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
retry:
ff_layout_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_RW,
- false,
- GFP_NOFS);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -953,13 +947,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
if (!pgio->pg_lseg) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- req->wb_bytes,
- IOMODE_RW,
- false,
- GFP_NOFS);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -1258,7 +1249,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
- GFP_NOIO);
+ nfs_io_gfp_mask());
switch (status) {
case NFS4ERR_DELAY:
@@ -1973,7 +1964,8 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
struct inode *inode = lseg->pls_layout->plh_inode;
struct pnfs_commit_array *array, *new;
- new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ nfs_io_gfp_mask());
if (new) {
spin_lock(&inode->i_lock);
array = pnfs_add_commit_array(fl_cinfo, new, lseg);
@@ -2152,10 +2144,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
struct nfs4_flexfile_layoutreturn_args *ff_args;
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
- ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+ ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
if (!ff_args)
goto out_nomem;
- ff_args->pages[0] = alloc_page(GFP_KERNEL);
+ ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
if (!ff_args->pages[0])
goto out_nomem_free;
@@ -2192,8 +2184,8 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
if (list_empty(&head))
return;
- errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
- sizeof(*errors), GFP_NOFS);
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
+ nfs_io_gfp_mask());
if (errors != NULL) {
const struct nfs4_ff_layout_ds_err *pos;
size_t n = 0;
@@ -2444,7 +2436,8 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
- args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
+ nfs_io_gfp_mask());
if (!args->devinfo)
return -ENOMEM;
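The allocation-flag churn above (GFP_NOIO, GFP_NOFS and GFP_KERNEL call sites all moving to nfs_io_gfp_mask()) is driven by one helper added to fs/nfs/internal.h later in this diff. Restated here with the rationale spelled out (illustrative copy; see the internal.h hunk for the authoritative version):

    static inline gfp_t nfs_io_gfp_mask(void)
    {
            /* Workqueue threads service writeback, so they must not block
             * waiting on reclaim that may itself need writeback to make
             * progress; give them a non-retrying, quiet allocation. */
            if (current->flags & PF_WQ_WORKER)
                    return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
            /* Ordinary task context can use full GFP_KERNEL reclaim; callers
             * inside a memalloc_nofs_save() scope still get NOFS behaviour. */
            return GFP_KERNEL;
    }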
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index ea17fa1f31ec..e2d59bb5e6bb 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -80,6 +80,7 @@ enum nfs_param {
Opt_source,
Opt_tcp,
Opt_timeo,
+ Opt_trunkdiscovery,
Opt_udp,
Opt_v,
Opt_vers,
@@ -180,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_string("source", Opt_source),
fsparam_flag ("tcp", Opt_tcp),
fsparam_u32 ("timeo", Opt_timeo),
+ fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery),
fsparam_flag ("udp", Opt_udp),
fsparam_flag ("v2", Opt_v),
fsparam_flag ("v3", Opt_v),
@@ -529,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
else
ctx->flags &= ~NFS_MOUNT_NOCTO;
break;
+ case Opt_trunkdiscovery:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY;
+ else
+ ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
+ break;
case Opt_ac:
if (result.negated)
ctx->flags |= NFS_MOUNT_NOAC;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index cfe901650ab0..f73c09a9cf0a 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -19,8 +19,7 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
-
-#define NFSDBG_FACILITY NFSDBG_FSCACHE
+#include "nfstrace.h"
#define NFS_MAX_KEY_LEN 1000
@@ -128,8 +127,6 @@ int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int u
vcookie = fscache_acquire_volume(key,
NULL, /* preferred_cache */
NULL, 0 /* coherency_data */);
- dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
- nfss, vcookie);
if (IS_ERR(vcookie)) {
if (vcookie != ERR_PTR(-EBUSY)) {
kfree(key);
@@ -152,9 +149,6 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
{
struct nfs_server *nfss = NFS_SB(sb);
- dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
- nfss, nfss->fscache);
-
fscache_relinquish_volume(nfss->fscache, NULL, false);
nfss->fscache = NULL;
kfree(nfss->fscache_uniq);
@@ -173,7 +167,7 @@ void nfs_fscache_init_inode(struct inode *inode)
if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
- nfs_fscache_update_auxdata(&auxdata, nfsi);
+ nfs_fscache_update_auxdata(&auxdata, inode);
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
0,
@@ -181,7 +175,7 @@ void nfs_fscache_init_inode(struct inode *inode)
nfsi->fh.size,
&auxdata, /* aux_data */
sizeof(auxdata),
- i_size_read(&nfsi->vfs_inode));
+ i_size_read(inode));
}
/*
@@ -192,8 +186,6 @@ void nfs_fscache_clear_inode(struct inode *inode)
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
- dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
-
fscache_relinquish_cookie(cookie, false);
nfsi->fscache = NULL;
}
@@ -220,7 +212,6 @@ void nfs_fscache_clear_inode(struct inode *inode)
void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
bool open_for_write = inode_is_open_for_write(inode);
@@ -229,8 +220,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
fscache_use_cookie(cookie, open_for_write);
if (open_for_write) {
- dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
- nfs_fscache_update_auxdata(&auxdata, nfsi);
+ nfs_fscache_update_auxdata(&auxdata, inode);
fscache_invalidate(cookie, &auxdata, i_size_read(inode),
FSCACHE_INVAL_DIO_WRITE);
}
@@ -240,23 +230,14 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
void nfs_fscache_release_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
if (fscache_cookie_valid(cookie)) {
- nfs_fscache_update_auxdata(&auxdata, nfsi);
+ nfs_fscache_update_auxdata(&auxdata, inode);
fscache_unuse_cookie(cookie, &auxdata, NULL);
}
}
-static inline void fscache_end_operation(struct netfs_cache_resources *cres)
-{
- const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
-
- if (ops)
- ops->end_operation(cres);
-}
-
/*
* Fallback page reading interface.
*/
@@ -319,58 +300,50 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
/*
* Retrieve a page from fscache
*/
-int __nfs_readpage_from_fscache(struct inode *inode, struct page *page)
+int __nfs_fscache_read_page(struct inode *inode, struct page *page)
{
int ret;
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
- nfs_i_fscache(inode), page, page->index, page->flags, inode);
-
+ trace_nfs_fscache_read_page(inode, page);
if (PageChecked(page)) {
- dfprintk(FSCACHE, "NFS: readpage_from_fscache: PageChecked\n");
ClearPageChecked(page);
- return 1;
+ ret = 1;
+ goto out;
}
ret = fscache_fallback_read_page(inode, page);
if (ret < 0) {
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
- dfprintk(FSCACHE,
- "NFS: readpage_from_fscache failed %d\n", ret);
SetPageChecked(page);
- return ret;
+ goto out;
}
/* Read completed synchronously */
- dfprintk(FSCACHE, "NFS: readpage_from_fscache: read successful\n");
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
SetPageUptodate(page);
- return 0;
+ ret = 0;
+out:
+ trace_nfs_fscache_read_page_exit(inode, page, ret);
+ return ret;
}
/*
* Store a newly fetched page in fscache. We can be certain there's no page
* stored in the cache as yet otherwise we would've read it from there.
*/
-void __nfs_readpage_to_fscache(struct inode *inode, struct page *page)
+void __nfs_fscache_write_page(struct inode *inode, struct page *page)
{
int ret;
- dfprintk(FSCACHE,
- "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n",
- nfs_i_fscache(inode), page, page->index, page->flags);
+ trace_nfs_fscache_write_page(inode, page);
ret = fscache_fallback_write_page(inode, page, true);
- dfprintk(FSCACHE,
- "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
- page, page->index, page->flags, ret);
-
if (ret != 0) {
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
} else {
nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
}
+ trace_nfs_fscache_write_page_exit(inode, page, ret);
}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 25a5c0f82392..4e980cc04779 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -45,10 +45,8 @@ extern void nfs_fscache_clear_inode(struct inode *);
extern void nfs_fscache_open_file(struct inode *, struct file *);
extern void nfs_fscache_release_file(struct inode *, struct file *);
-extern int __nfs_readpage_from_fscache(struct inode *, struct page *);
-extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr,
- unsigned long bytes);
-extern void __nfs_readpage_to_fscache(struct inode *, struct page *);
+extern int __nfs_fscache_read_page(struct inode *, struct page *);
+extern void __nfs_fscache_write_page(struct inode *, struct page *);
static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
@@ -66,11 +64,10 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
/*
* Retrieve a page from an inode data storage object.
*/
-static inline int nfs_readpage_from_fscache(struct inode *inode,
- struct page *page)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
- if (NFS_I(inode)->fscache)
- return __nfs_readpage_from_fscache(inode, page);
+ if (nfs_i_fscache(inode))
+ return __nfs_fscache_read_page(inode, page);
return -ENOBUFS;
}
@@ -78,24 +75,24 @@ static inline int nfs_readpage_from_fscache(struct inode *inode,
* Store a page newly fetched from the server in an inode data storage object
* in the cache.
*/
-static inline void nfs_readpage_to_fscache(struct inode *inode,
+static inline void nfs_fscache_write_page(struct inode *inode,
struct page *page)
{
- if (NFS_I(inode)->fscache)
- __nfs_readpage_to_fscache(inode, page);
+ if (nfs_i_fscache(inode))
+ __nfs_fscache_write_page(inode, page);
}
static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
- struct nfs_inode *nfsi)
+ struct inode *inode)
{
memset(auxdata, 0, sizeof(*auxdata));
- auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
- auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
- auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
- auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+ auxdata->mtime_sec = inode->i_mtime.tv_sec;
+ auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->ctime_sec = inode->i_ctime.tv_sec;
+ auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+ if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+ auxdata->change_attr = inode_peek_iversion_raw(inode);
}
/*
@@ -107,9 +104,9 @@ static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
struct nfs_inode *nfsi = NFS_I(inode);
if (nfsi->fscache) {
- nfs_fscache_update_auxdata(&auxdata, nfsi);
+ nfs_fscache_update_auxdata(&auxdata, inode);
fscache_invalidate(nfsi->fscache, &auxdata,
- i_size_read(&nfsi->vfs_inode), flags);
+ i_size_read(inode), flags);
}
}
@@ -136,15 +133,11 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
return 1; /* True: may release page */
}
-static inline int nfs_readpage_from_fscache(struct inode *inode,
- struct page *page)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
return -ENOBUFS;
}
-static inline void nfs_readpage_to_fscache(struct inode *inode,
- struct page *page) {}
-
-
+static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d96baa4450e3..b4e46b0ffa2d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -203,14 +203,13 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_XATTR);
flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
- } else if (flags & NFS_INO_REVAL_PAGECACHE)
- flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
+ }
if (!nfs_has_xattr_cache(nfsi))
flags &= ~NFS_INO_INVALID_XATTR;
if (flags & NFS_INO_INVALID_DATA)
nfs_fscache_invalidate(inode, 0);
- flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
+ flags &= ~NFS_INO_REVAL_FORCED;
nfsi->cache_validity |= flags;
@@ -236,19 +235,17 @@ static void nfs_zap_caches_locked(struct inode *inode)
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = jiffies;
- if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_DATA
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL
- | NFS_INO_INVALID_XATTR
- | NFS_INO_REVAL_PAGECACHE);
- } else
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL
- | NFS_INO_INVALID_XATTR
- | NFS_INO_REVAL_PAGECACHE);
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_DATA |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
nfs_zap_label_cache_locked(nfsi);
}
@@ -564,8 +561,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_gid = fattr->gid;
else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
- if (nfs_server_capable(inode, NFS_CAP_XATTR))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
@@ -785,26 +780,32 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
-static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+/*
+ * Don't request help from readdirplus if the file is being written to,
+ * or if attribute caching is turned off
+ */
+static bool nfs_getattr_readdirplus_enable(const struct inode *inode)
{
- struct dentry *parent;
+ return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) &&
+ !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ;
+}
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
- nfs_force_use_readdirplus(d_inode(parent));
- dput(parent);
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+{
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_miss(d_inode(parent));
+ dput(parent);
+ }
}
static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
{
- struct dentry *parent;
-
- if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
- return;
- parent = dget_parent(dentry);
- nfs_advise_use_readdirplus(d_inode(parent));
- dput(parent);
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_hit(d_inode(parent));
+ dput(parent);
+ }
}
static u32 nfs_get_valid_attrmask(struct inode *inode)
@@ -840,6 +841,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
int err = 0;
bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
bool do_update = false;
+ bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode);
trace_nfs_getattr_enter(inode);
@@ -848,7 +850,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_INO | STATX_SIZE | STATX_BLOCKS;
if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
- nfs_readdirplus_parent_cache_hit(path->dentry);
+ if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_hit(path->dentry);
goto out_no_revalidate;
}
@@ -898,15 +901,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
if (do_update) {
- /* Update the attribute cache */
- if (!(server->flags & NFS_MOUNT_NOAC))
+ if (readdirplus_enabled)
nfs_readdirplus_parent_cache_miss(path->dentry);
- else
- nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
if (err)
goto out;
- } else
+ } else if (readdirplus_enabled)
nfs_readdirplus_parent_cache_hit(path->dentry);
out_no_revalidate:
/* Only return attributes that were revalidated. */
@@ -952,7 +952,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
res = __nfs_find_lock_context(ctx);
rcu_read_unlock();
if (res == NULL) {
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
if (new == NULL)
return ERR_PTR(-ENOMEM);
nfs_init_lock_context(new);
@@ -1030,7 +1030,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
{
struct nfs_open_context *ctx;
- ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
if (!ctx)
return ERR_PTR(-ENOMEM);
nfs_sb_active(dentry->d_sb);
@@ -1180,7 +1180,6 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_fscache_open_file(inode, filp);
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
@@ -1583,7 +1582,7 @@ struct nfs_fattr *nfs_alloc_fattr(void)
{
struct nfs_fattr *fattr;
- fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+ fattr = kmalloc(sizeof(*fattr), GFP_KERNEL);
if (fattr != NULL) {
nfs_fattr_init(fattr);
fattr->label = NULL;
@@ -1599,7 +1598,7 @@ struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
if (!fattr)
return NULL;
- fattr->label = nfs4_label_alloc(server, GFP_NOFS);
+ fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
if (IS_ERR(fattr->label)) {
kfree(fattr);
return NULL;
@@ -1613,7 +1612,7 @@ struct nfs_fh *nfs_alloc_fhandle(void)
{
struct nfs_fh *fh;
- fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+ fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
if (fh != NULL)
fh->size = 0;
return fh;
@@ -2238,7 +2237,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
struct inode *nfs_alloc_inode(struct super_block *sb)
{
struct nfs_inode *nfsi;
- nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+ nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL);
if (!nfsi)
return NULL;
nfsi->flags = 0UL;
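Note the switch from kmem_cache_alloc() to alloc_inode_sb() in nfs_alloc_inode(): the new VFS helper charges the inode to the superblock's LRU/memcg accounting. Sketch of that helper as introduced in the same release (check include/linux/fs.h for the authoritative definition):

    static inline void *alloc_inode_sb(struct super_block *sb,
                                       struct kmem_cache *cache, gfp_t gfp)
    {
            return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
    }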
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2de7c56a1fbe..7eefa16ed381 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
return true;
}
+static inline fmode_t flags_to_mode(int flags)
+{
+ fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+ if ((flags & O_ACCMODE) != O_WRONLY)
+ res |= FMODE_READ;
+ if ((flags & O_ACCMODE) != O_RDONLY)
+ res |= FMODE_WRITE;
+ return res;
+}
+
/*
* Note: RFC 1813 doesn't limit the number of auth flavors that
* a server can return, so make something up.
@@ -366,8 +376,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
/* dir.c */
-extern void nfs_advise_use_readdirplus(struct inode *dir);
-extern void nfs_force_use_readdirplus(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -388,6 +398,20 @@ int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t,
int nfs_rename(struct user_namespace *, struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
+#ifdef CONFIG_NFS_V4_2
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ if (!(server->caps & NFS_CAP_XATTR))
+ return 0;
+ return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST;
+}
+#else
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ return 0;
+}
+#endif
+
/* file.c */
int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
@@ -573,6 +597,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
}
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
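The flags_to_mode() helper added above reduces to a small truth table over O_ACCMODE; the otherwise-invalid value 3 (which nfs4_file_open() special-cases later in this diff) maps to read plus write:

    /* flags_to_mode(O_RDONLY) == FMODE_READ
     * flags_to_mode(O_WRONLY) == FMODE_WRITE
     * flags_to_mode(O_RDWR)   == FMODE_READ | FMODE_WRITE
     * flags_to_mode(3)        == FMODE_READ | FMODE_WRITE
     * plus FMODE_EXEC whenever the open flags carry it. */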
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7fba7711e6b3..05c3b4b2b3dd 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -949,13 +949,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
* RFC 1094. This implementation assumes that it's an XDR uint32.
*/
- entry->prev_cookie = entry->cookie;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
return -EAGAIN;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9274c9c5efea..3b0b650c9c5a 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1261,6 +1261,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
static void encode_readdirplus3args(struct xdr_stream *xdr,
const struct nfs3_readdirargs *args)
{
+ uint32_t dircount = args->count;
+ uint32_t maxcount = args->count;
__be32 *p;
encode_nfs_fh3(xdr, args->fh);
@@ -1273,9 +1275,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr,
* readdirplus: need dircount + buffer size.
* We just make sure we make dircount big enough
*/
- *p++ = cpu_to_be32(args->count >> 3);
-
- *p = cpu_to_be32(args->count);
+ *p++ = cpu_to_be32(dircount);
+ *p = cpu_to_be32(maxcount);
}
static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
@@ -1967,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
bool plus)
{
struct user_namespace *userns = rpc_userns(entry->server->client);
- struct nfs_entry old = *entry;
__be32 *p;
int error;
u64 new_cookie;
@@ -1987,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_fileid3(xdr, &entry->ino);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
- return error;
+ return -EAGAIN;
entry->d_type = DT_UNKNOWN;
@@ -2003,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
entry->fattr->valid = 0;
error = decode_post_op_attr(xdr, entry->fattr, userns);
if (unlikely(error))
- return error;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
@@ -2018,24 +2018,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
- if (unlikely(error)) {
- if (error == -E2BIG)
- goto out_truncated;
- return error;
- }
+ if (unlikely(error))
+ return -EAGAIN;
} else
zero_nfs_fh3(entry->fh);
}
- entry->prev_cookie = entry->cookie;
entry->cookie = new_cookie;
return 0;
-
-out_truncated:
- dprintk("NFS: directory entry contains invalid file handle\n");
- *entry = old;
- return -EAGAIN;
}
/*
@@ -2228,6 +2219,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ result->xattr_support = 0;
return 0;
}
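In encode_readdirplus3args() the two on-the-wire counts are no longer derived by shifting; both now carry the caller's full args->count. For reference, the RFC 1813 fields being encoded are (informal sketch of the wire format, not a kernel structure):

    /* READDIRPLUS3args tail, per RFC 1813:
     *   uint32 dircount;  -- byte budget for names and cookies only
     *   uint32 maxcount;  -- size cap on the whole reply, attributes included
     */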
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 32129446beca..068c45b3bc1a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -175,28 +175,27 @@ static int handle_async_copy(struct nfs42_copy_res *res,
nfs4_stateid *src_stateid,
bool *restart)
{
- struct nfs4_copy_state *copy, *tmp_copy;
+ struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
int status = NFS4_OK;
- bool found_pending = false;
struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
struct nfs_open_context *src_ctx = nfs_file_open_context(src);
- copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
return -ENOMEM;
spin_lock(&dst_server->nfs_client->cl_lock);
- list_for_each_entry(tmp_copy,
+ list_for_each_entry(iter,
&dst_server->nfs_client->pending_cb_stateids,
copies) {
- if (memcmp(&res->write_res.stateid, &tmp_copy->stateid,
+ if (memcmp(&res->write_res.stateid, &iter->stateid,
NFS4_STATEID_SIZE))
continue;
- found_pending = true;
- list_del(&tmp_copy->copies);
+ tmp_copy = iter;
+ list_del(&iter->copies);
break;
}
- if (found_pending) {
+ if (tmp_copy) {
spin_unlock(&dst_server->nfs_client->cl_lock);
kfree(copy);
copy = tmp_copy;
@@ -254,7 +253,7 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst,
struct nfs_commitres cres;
int status = -ENOMEM;
- cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
if (!cres.verf)
goto out;
@@ -357,7 +356,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
res->commit_res.verf = NULL;
if (args->sync) {
res->commit_res.verf =
- kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
if (!res->commit_res.verf)
return -ENOMEM;
}
@@ -552,7 +551,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
return -EOPNOTSUPP;
- data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_NOFS);
+ data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -591,8 +590,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
ctx = get_nfs_open_context(nfs_file_open_context(src));
l_ctx = nfs_get_lock_context(ctx);
- if (IS_ERR(l_ctx))
- return PTR_ERR(l_ctx);
+ if (IS_ERR(l_ctx)) {
+ status = PTR_ERR(l_ctx);
+ goto out;
+ }
status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
FMODE_READ);
@@ -600,7 +601,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
if (status) {
if (status == -EAGAIN)
status = -NFS4ERR_BAD_STATEID;
- return status;
+ goto out;
}
status = nfs4_call_sync(src_server->client, src_server, &msg,
@@ -609,6 +610,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
if (status == -ENOTSUPP)
src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
+out:
put_nfs_open_context(nfs_file_open_context(src));
return status;
}
@@ -626,7 +628,7 @@ int nfs42_proc_copy_notify(struct file *src, struct file *dst,
if (!(src_server->caps & NFS_CAP_COPY_NOTIFY))
return -EOPNOTSUPP;
- args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS);
+ args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL);
if (args == NULL)
return -ENOMEM;
@@ -1014,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
return -EOPNOTSUPP;
if (n > NFS42_LAYOUTERROR_MAX)
return -EINVAL;
- data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+ data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask());
if (!data)
return -ENOMEM;
for (i = 0; i < n; i++) {
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 1c4d2a05b401..e7b34f7e0614 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -199,7 +199,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
flags = NFS4_XATTR_ENTRY_EXTVAL;
}
- buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ buf = kmalloc(alloclen, GFP_KERNEL);
if (buf == NULL)
return NULL;
entry = (struct nfs4_xattr_entry *)buf;
@@ -213,7 +213,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
- valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ valp = kvmalloc(len, GFP_KERNEL);
if (valp == NULL) {
kfree(buf);
return NULL;
@@ -289,8 +289,7 @@ nfs4_xattr_alloc_cache(void)
{
struct nfs4_xattr_cache *cache;
- cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
- GFP_KERNEL_ACCOUNT | GFP_NOFS);
+ cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
if (cache == NULL)
return NULL;
@@ -998,7 +997,7 @@ int __init nfs4_xattr_cache_init(void)
nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
sizeof(struct nfs4_xattr_cache), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
nfs4_xattr_cache_init_once);
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84f39b6f1b1e..79df6e83881b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,6 +42,7 @@ enum nfs4_client_state {
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
NFS4CLNT_RUN_MANAGER,
+ NFS4CLNT_MANAGER_AVAILABLE,
NFS4CLNT_RECALL_RUNNING,
NFS4CLNT_RECALL_ANY_LAYOUT_READ,
NFS4CLNT_RECALL_ANY_LAYOUT_RW,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e79ae4cbc395..7b861e4f0533 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
+ fmode_t f_mode;
struct iattr attr;
int err;
@@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+ f_mode = filp->f_mode;
if ((openflags & O_ACCMODE) == 3)
- return nfs_open(inode, filp);
+ f_mode |= flags_to_mode(openflags);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
@@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -165,7 +167,7 @@ retry:
if (sync)
return -EOPNOTSUPP;
cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
- GFP_NOFS);
+ GFP_KERNEL);
if (unlikely(cn_resp == NULL))
return -ENOMEM;
@@ -180,8 +182,8 @@ retry:
ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count,
nss, cnrs, sync);
out:
- if (!nfs42_files_from_same_server(file_in, file_out))
- kfree(cn_resp);
+ kfree(cn_resp);
+
if (ret == -EAGAIN)
goto retry;
return ret;
@@ -339,7 +341,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
res = ERR_PTR(-ENOMEM);
len = strlen(SSC_READ_NAME_BODY) + 16;
- read_name = kzalloc(len, GFP_NOFS);
+ read_name = kzalloc(len, GFP_KERNEL);
if (read_name == NULL)
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e0db6c27619..16106f805ffa 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1392,13 +1392,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
case NFS4_OPEN_CLAIM_FH:
p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE |
- NFS4_ACCESS_EXECUTE;
-#ifdef CONFIG_NFS_V4_2
- if (!(server->caps & NFS_CAP_XATTR))
- break;
- p->o_arg.access |= NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE |
- NFS4_ACCESS_XALIST;
-#endif
+ NFS4_ACCESS_EXECUTE |
+ nfs_access_xattr_mask(server);
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
@@ -3050,6 +3045,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED)
+ set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags);
dentry = opendata->dentry;
if (d_really_is_negative(dentry)) {
@@ -5904,7 +5901,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
buflen = server->rsize;
npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1;
- pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS);
+ pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -6609,7 +6606,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
};
int status = 0;
- data = kzalloc(sizeof(*data), GFP_NOFS);
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -6797,7 +6794,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_state *state = lsp->ls_state;
struct inode *inode = state->inode;
- p = kzalloc(sizeof(*p), GFP_NOFS);
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
p->arg.fh = NFS_FH(inode);
@@ -7202,8 +7199,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
task_setup_data.flags |= RPC_TASK_MOVEABLE;
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
- fl->fl_u.nfs4_fl.owner,
- recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
+ fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
if (IS_SETLKW(cmd))
@@ -7626,7 +7622,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
if (server->nfs_client->cl_mvops->minor_version != 0)
return;
- data = kmalloc(sizeof(*data), GFP_NOFS);
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return;
data->lsp = lsp;
@@ -8012,6 +8008,18 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
int status;
nfs_fattr_init(&locations->fattr);
@@ -8019,8 +8027,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
locations->nlocations = 0;
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
- status = nfs4_call_sync_sequence(clnt, server, &msg,
- &args.seq_args, &res.seq_res);
+ status = nfs4_call_sync_custom(&task_setup_data);
if (status == NFS4_OK &&
res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
status = -NFS4ERR_LEASE_MOVED;
@@ -8333,6 +8340,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_DEADSESSION:
nfs4_schedule_session_recovery(clp->cl_session,
task->tk_status);
+ return;
}
if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
res->dir != NFS4_CDFS4_BOTH) {
@@ -9291,7 +9299,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
goto out_err;
ret = ERR_PTR(-ENOMEM);
- calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
if (calldata == NULL)
goto out_put_clp;
nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged);
@@ -9607,6 +9615,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return ERR_CAST(task);
status = rpc_wait_for_completion_task(task);
if (status != 0)
@@ -10222,7 +10232,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
&task_setup.rpc_client, &msg);
dprintk("NFS call free_stateid %p\n", stateid);
- data = kmalloc(sizeof(*data), GFP_NOFS);
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
data->server = server;
@@ -10461,6 +10471,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
return error + error2 + error3;
}
+static void nfs4_enable_swap(struct inode *inode)
+{
+ /* The state manager thread must always be running.
+ * It will notice the client is a swapper, and stay put.
+ */
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_disable_swap(struct inode *inode)
+{
+ /* The state manager thread will now exit once it is
+ * woken.
+ */
+ wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state);
+}
+
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -10538,6 +10566,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.create_server = nfs4_create_server,
.clone_server = nfs_clone_server,
.discover_trunking = nfs4_discover_trunking,
+ .enable_swap = nfs4_enable_swap,
+ .disable_swap = nfs4_disable_swap,
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
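nfs4_enable_swap()/nfs4_disable_swap() back the new ->enable_swap/->disable_swap entries in nfs_rpc_ops; the matching ->disable_swap call site appears at the top of this diff in the swapfile-deactivation path. Assumed shape of the hooks (the real declaration lives in include/linux/nfs_xdr.h):

    struct nfs_rpc_ops {
            /* ... existing operations ... */
            void (*enable_swap)(struct inode *inode);
            void (*disable_swap)(struct inode *inode);
    };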
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f5a62c0d999b..9e1c987c81e7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
@@ -666,7 +667,7 @@ nfs4_alloc_open_state(void)
{
struct nfs4_state *state;
- state = kzalloc(sizeof(*state), GFP_NOFS);
+ state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT);
if (!state)
return NULL;
refcount_set(&state->count, 1);
@@ -820,7 +821,7 @@ static void __nfs4_close(struct nfs4_state *state,
void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(state, fmode, GFP_NOFS, 0);
+ __nfs4_close(state, fmode, GFP_KERNEL, 0);
}
void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
@@ -869,14 +870,15 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
- lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+ lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT);
if (lsp == NULL)
return NULL;
nfs4_init_seqid_counter(&lsp->ls_seqid);
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = fl_owner;
- lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+ lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id,
+ 0, 0, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
@@ -1205,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
- if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
+ wake_up_var(&clp->cl_state);
return;
+ }
+ set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
__module_get(THIS_MODULE);
refcount_inc(&clp->cl_count);
@@ -1224,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
nfs4_clear_state_manager_bit(clp);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -2560,9 +2570,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
static void nfs4_state_manager(struct nfs_client *clp)
{
+ unsigned int memflags;
int status = 0;
const char *section = "", *section_sep = "";
+ /*
+ * State recovery can deadlock if the direct reclaim code tries to
+ * start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
/* Ensure exclusive access to NFSv4 state */
do {
trace_nfs4_state_mgr(clp);
@@ -2657,6 +2675,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
}
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
@@ -2669,11 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
}
- /* Did we race with an attempt to give us more work? */
- if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
- return;
- if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
- return;
+ return;
+
} while (refcount_read(&clp->cl_count) > 1 && !signalled());
goto out_drain;
@@ -2686,6 +2702,7 @@ out_error:
clp->cl_hostname, -status);
ssleep(1);
out_drain:
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
}
@@ -2693,10 +2710,31 @@ out_drain:
static int nfs4_run_state_manager(void *ptr)
{
struct nfs_client *clp = ptr;
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
allow_signal(SIGKILL);
+again:
+ set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
nfs4_state_manager(clp);
+ if (atomic_read(&cl->cl_swapper)) {
+ wait_var_event_interruptible(&clp->cl_state,
+ test_bit(NFS4CLNT_RUN_MANAGER,
+ &clp->cl_state));
+ if (atomic_read(&cl->cl_swapper) &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ goto again;
+ /* Either no longer a swapper, or we were signalled */
+ }
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+ goto again;
+
nfs_put_client(clp);
- module_put_and_kthread_exit(0);
return 0;
}
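Two threads of change meet in nfs4state.c: state recovery now runs inside a memalloc_nofs_save()/memalloc_nofs_restore() scope so that plain GFP_KERNEL allocations on that path cannot recurse into NFS writeback, and nfs4_run_state_manager() keeps the kthread parked on cl_state while the client is a swapper, to be woken by the wake_up_var() in nfs4_schedule_state_manager(). Compressed sketch of the scoped-NOFS pattern, with do_recovery_work() standing in for the real body:

    unsigned int memflags = memalloc_nofs_save();
    do_recovery_work();     /* hypothetical stand-in; allocations here behave as GFP_NOFS */
    memalloc_nofs_restore(memflags);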
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8e70b92df4cc..86a5f6516928 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1605,7 +1605,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
- uint32_t dircount = readdir->count >> 1;
+ uint32_t dircount = readdir->count;
+ uint32_t maxcount = readdir->count;
__be32 *p, verf[2];
uint32_t attrlen = 0;
unsigned int i;
@@ -1618,7 +1619,6 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
- dircount >>= 1;
}
/* Use mounted_on_fileid only if the server supports it */
if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
@@ -1634,7 +1634,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
encode_nfs4_verifier(xdr, &readdir->verifier);
p = reserve_space(xdr, 12 + (attrlen << 2));
*p++ = cpu_to_be32(dircount);
- *p++ = cpu_to_be32(readdir->count);
+ *p++ = cpu_to_be32(maxcount);
*p++ = cpu_to_be32(attrlen);
for (i = 0; i < attrlen; i++)
*p++ = cpu_to_be32(attrs[i]);
@@ -7508,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
- entry->prev_cookie = entry->cookie;
entry->cookie = new_cookie;
return 0;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 317ce27bdc4b..012bd7339862 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -21,7 +21,6 @@
{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
- { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
{ NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
@@ -37,7 +36,6 @@
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
- { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
{ BIT(NFS_INO_STALE), "STALE" }, \
{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
@@ -162,6 +160,9 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
DEFINE_NFS_INODE_EVENT(nfs_access_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
+DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);
TRACE_EVENT(nfs_access_exit,
TP_PROTO(
@@ -273,6 +274,122 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
+DECLARE_EVENT_CLASS(nfs_inode_range_event,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t range_start,
+ loff_t range_end
+ ),
+
+ TP_ARGS(inode, range_start, range_end),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, range_start)
+ __field(loff_t, range_end)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->range_start = range_start;
+ __entry->range_end = range_end;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "range=[%lld, %lld]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->range_start, __entry->range_end
+ )
+);
+
+#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_range_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t range_start, \
+ loff_t range_end \
+ ), \
+ TP_ARGS(inode, range_start, range_end))
+
+DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);
+
+DECLARE_EVENT_CLASS(nfs_readdir_event,
+ TP_PROTO(
+ const struct file *file,
+ const __be32 *verifier,
+ u64 cookie,
+ pgoff_t page_index,
+ unsigned int dtsize
+ ),
+
+ TP_ARGS(file, verifier, cookie, page_index, dtsize),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ __field(u64, cookie)
+ __field(pgoff_t, index)
+ __field(unsigned int, dtsize)
+ ),
+
+ TP_fast_assign(
+ const struct inode *dir = file_inode(file);
+ const struct nfs_inode *nfsi = NFS_I(dir);
+
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(dir);
+ if (cookie != 0)
+ memcpy(__entry->verifier, verifier,
+ NFS4_VERIFIER_SIZE);
+ else
+ memset(__entry->verifier, 0,
+ NFS4_VERIFIER_SIZE);
+ __entry->cookie = cookie;
+ __entry->index = page_index;
+ __entry->dtsize = dtsize;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "cookie=%s:0x%llx cache_index=%lu dtsize=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->version, show_nfs4_verifier(__entry->verifier),
+ (unsigned long long)__entry->cookie, __entry->index,
+ __entry->dtsize
+ )
+);
+
+#define DEFINE_NFS_READDIR_EVENT(name) \
+ DEFINE_EVENT(nfs_readdir_event, name, \
+ TP_PROTO( \
+ const struct file *file, \
+ const __be32 *verifier, \
+ u64 cookie, \
+ pgoff_t page_index, \
+ unsigned int dtsize \
+ ), \
+ TP_ARGS(file, verifier, cookie, page_index, dtsize))
+
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill);
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached);
+
DECLARE_EVENT_CLASS(nfs_lookup_event,
TP_PROTO(
const struct inode *dir,
@@ -366,6 +483,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate);
TRACE_EVENT(nfs_atomic_open_enter,
TP_PROTO(
@@ -889,11 +1009,11 @@ TRACE_EVENT(nfs_aop_readpage_done,
TRACE_EVENT(nfs_aop_readahead,
TP_PROTO(
const struct inode *inode,
- struct page *page,
+ loff_t pos,
unsigned int nr_pages
),
- TP_ARGS(inode, page, nr_pages),
+ TP_ARGS(inode, pos, nr_pages),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -911,7 +1031,7 @@ TRACE_EVENT(nfs_aop_readahead,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->offset = pos;
__entry->nr_pages = nr_pages;
),
@@ -1095,6 +1215,97 @@ TRACE_EVENT(nfs_readpage_short,
)
);
+DECLARE_EVENT_CLASS(nfs_fscache_page_event,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page
+ ),
+
+ TP_ARGS(inode, page),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset
+ )
+);
+DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page,
+ int error
+ ),
+
+ TP_ARGS(inode, page, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld error=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->error
+ )
+);
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
+ DEFINE_EVENT(nfs_fscache_page_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct page *page \
+ ), \
+ TP_ARGS(inode, page))
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_fscache_page_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct page *page, \
+ int error \
+ ), \
+ TP_ARGS(inode, page, error))
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
+
TRACE_EVENT(nfs_pgio_error,
TP_PROTO(
const struct nfs_pgio_header *hdr,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ad7f83dc9a2d..9157dd19b8b4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
}
}
-static inline struct nfs_page *
-nfs_page_alloc(void)
+static inline struct nfs_page *nfs_page_alloc(void)
{
- struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+ struct nfs_page *p =
+ kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
if (p)
INIT_LIST_HEAD(&p->wb_list);
return p;
@@ -892,7 +892,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_commit_info cinfo;
struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
- gfp_t gfp_flags = GFP_KERNEL;
+ gfp_t gfp_flags = nfs_io_gfp_mask();
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
pg_array->npages = pagecount;
@@ -979,7 +979,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
desc->pg_mirrors_dynamic = NULL;
if (mirror_count == 1)
return desc->pg_mirrors_static;
- ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
+ ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
if (ret != NULL) {
for (i = 0; i < mirror_count; i++)
nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
@@ -1218,6 +1218,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
do {
list_splice_init(&mirror->pg_list, &head);
+ mirror->pg_recoalesce = 0;
while (!list_empty(&head)) {
struct nfs_page *req;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7c9090a28e5c..856c962273c7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -92,6 +92,17 @@ find_pnfs_driver(u32 id)
return local;
}
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+ if (ld)
+ module_put(ld->owner);
+}
+
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
@@ -1233,7 +1244,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
int status = 0;
*pcred = NULL;
- lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+ lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask());
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
@@ -2206,7 +2217,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
struct pnfs_layout_hdr *lo;
spin_lock(&ino->i_lock);
- lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
+ lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask());
if (!lo)
goto out_unlock;
if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
@@ -2249,8 +2260,8 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
lo = _pnfs_grab_empty_layout(ino, ctx);
if (!lo)
return;
- lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
- &rng, GFP_KERNEL);
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
if (!lgp) {
pnfs_clear_first_layoutget(lo);
nfs_layoutget_end(lo);
@@ -2275,8 +2286,8 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data,
};
struct nfs4_layoutget *lgp;
- lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
- &rng, GFP_KERNEL);
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
if (!lgp)
return;
data->lgp = lgp;
@@ -2691,13 +2702,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
else
rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- rd_size,
- IOMODE_READ,
- false,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), rd_size,
+ IOMODE_READ, false,
+ nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -2718,13 +2727,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
pnfs_generic_pg_check_layout(pgio);
pnfs_generic_pg_check_range(pgio, req);
if (pgio->pg_lseg == NULL) {
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- nfs_req_openctx(req),
- req_offset(req),
- wb_size,
- IOMODE_RW,
- false,
- GFP_KERNEL);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), wb_size, IOMODE_RW,
+ false, nfs_io_gfp_mask());
if (IS_ERR(pgio->pg_lseg)) {
pgio->pg_error = PTR_ERR(pgio->pg_lseg);
pgio->pg_lseg = NULL;
@@ -3183,7 +3189,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = -ENOMEM;
/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
- data = kzalloc(sizeof(*data), GFP_NOFS);
+ data = kzalloc(sizeof(*data), nfs_io_gfp_mask());
if (!data)
goto clear_layoutcommitting;
@@ -3250,7 +3256,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
struct nfs4_threshold *thp;
- thp = kzalloc(sizeof(*thp), GFP_NOFS);
+ thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask());
if (!thp) {
dprintk("%s mdsthreshold allocation failed\n", __func__);
return NULL;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f4d7548d67b2..07f11489e4e9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -234,6 +234,8 @@ struct pnfs_devicelist {
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
/* nfs4proc.c */
extern size_t max_response_pages(struct nfs_server *server);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 316f68f96e57..657c242a18ff 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@ static struct nfs_commit_data *
pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
if (!data)
return NULL;
@@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
data->ds_commit_index = -1;
list_splice_init(mds_pages, &data->pages);
list_add_tail(&data->list, &list);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 73dcaa99fa9b..e3570c656b0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
info->maxfilesize = 0x7FFFFFFF;
info->lease_time = 0;
info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ info->xattr_support = 0;
return 0;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index eb00229c1a50..5e7657374bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
struct address_space *mapping = page_file_mapping(page);
if (PageUptodate(page))
- nfs_readpage_to_fscache(inode, page);
+ nfs_fscache_write_page(inode, page);
else if (!PageError(page) && !PagePrivate(page))
generic_error_remove_page(mapping, page);
unlock_page(page);
@@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
const struct nfs_rpc_ops *rpc_ops,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = hdr->inode;
- int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
-
- task_setup_data->flags |= swap_flags;
rpc_ops->read_setup(hdr, msg);
trace_nfs_initiate_read(hdr);
}
@@ -290,9 +286,8 @@ static void nfs_readpage_result(struct rpc_task *task,
}
static int
-readpage_async_filler(void *data, struct page *page)
+readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
{
- struct nfs_readdesc *desc = data;
struct inode *inode = page_file_mapping(page)->host;
unsigned int rsize = NFS_SERVER(inode)->rsize;
struct nfs_page *new;
@@ -306,7 +301,7 @@ readpage_async_filler(void *data, struct page *page)
aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
if (!IS_SYNC(page->mapping->host)) {
- error = nfs_readpage_from_fscache(page->mapping->host, page);
+ error = nfs_fscache_read_page(page->mapping->host, page);
if (error == 0)
goto out_unlock;
}
@@ -397,14 +392,16 @@ out_unlock:
return ret;
}
-int nfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void nfs_readahead(struct readahead_control *ractl)
{
+ unsigned int nr_pages = readahead_count(ractl);
+ struct file *file = ractl->file;
struct nfs_readdesc desc;
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ struct page *page;
int ret;
- trace_nfs_aop_readahead(inode, lru_to_page(pages), nr_pages);
+ trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
ret = -ESTALE;
@@ -422,14 +419,18 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
- ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+ while ((page = readahead_page(ractl)) != NULL) {
+ ret = readpage_async_filler(&desc, page);
+ put_page(page);
+ if (ret)
+ break;
+ }
nfs_pageio_complete_read(&desc.pgio);
put_nfs_open_context(desc.ctx);
out:
trace_nfs_aop_readahead_done(inode, nr_pages, ret);
- return ret;
}
int __init nfs_init_readpagecache(void)
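nfs_readahead() above replaces the ->readpages() list walk with the readahead_control iterator. A minimal sketch of that iteration pattern, with example_read_one_page() standing in for the per-page submission logic (a hypothetical placeholder, not an NFS function):

#include <linux/pagemap.h>

static int example_read_one_page(struct inode *inode, struct page *page);

static void example_readahead(struct readahead_control *ractl)
{
	struct page *page;

	/* readahead_page() hands back the next page, locked and with a
	 * reference held; the filesystem unlocks it once I/O is set up. */
	while ((page = readahead_page(ractl)) != NULL) {
		int err = example_read_one_page(ractl->mapping->host, page);

		put_page(page);		/* drop the readahead_page() reference */
		if (err)
			break;		/* readahead is best effort */
	}
}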
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 5fa11e1aca4c..6f325e10056c 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -347,6 +347,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return ERR_PTR(-ENOMEM);
+ task_setup_data.task = &data->task;
task_setup_data.callback_data = data;
data->cred = get_current_cred();
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 987a187bd39a..f00d45cf80ef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool;
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
{
struct nfs_commit_data *p;
- if (never_fail)
- p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
- else {
- /* It is OK to do some reclaim, but not safe to wait
- * for anything to be returned to the pool.
- * mempool_alloc() cannot handle that particular combination,
- * so we need two separate attempts.
- */
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
if (!p)
- p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
- __GFP_NOWARN | __GFP_NORETRY);
- if (!p)
return NULL;
+ memset(p, 0, sizeof(*p));
}
-
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
return p;
}
@@ -104,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
- struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
+ struct nfs_pgio_header *p;
- memset(p, 0, sizeof(*p));
+ p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
+ p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p));
+ }
p->rw_mode = FMODE_WRITE;
return p;
}
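The rewritten nfs_commitdata_alloc() and nfs_writehdr_alloc() above follow the same strategy: try the slab cache with a reclaim-friendly mask, then fall back to the mempool reserve without sleeping, and let the caller handle failure. A generic sketch of that pattern (the ex_ names are placeholders):

#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct ex_data { struct list_head pages; };

static struct ex_data *ex_data_alloc(struct kmem_cache *cachep, mempool_t *pool)
{
	struct ex_data *p;

	p = kmem_cache_zalloc(cachep, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (!p) {
		/* GFP_NOWAIT takes from the preallocated reserve or fails;
		 * it never sleeps waiting for objects to be returned. */
		p = mempool_alloc(pool, GFP_NOWAIT);
		if (!p)
			return NULL;	/* callers must now cope with NULL */
		memset(p, 0, sizeof(*p));
	}
	INIT_LIST_HEAD(&p->pages);
	return p;
}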
@@ -306,7 +302,7 @@ static void nfs_set_pageerror(struct address_space *mapping)
/* Force file size revalidation */
spin_lock(&inode->i_lock);
nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED |
- NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_INVALID_CHANGE |
NFS_INO_INVALID_SIZE);
spin_unlock(&inode->i_lock);
}
@@ -316,7 +312,10 @@ static void nfs_mapping_set_error(struct page *page, int error)
struct address_space *mapping = page_file_mapping(page);
SetPageError(page);
- mapping_set_error(mapping, error);
+ filemap_set_wb_err(mapping, error);
+ if (mapping->host)
+ errseq_set(&mapping->host->i_sb->s_wb_err,
+ error == -ENOSPC ? -ENOSPC : -EIO);
nfs_set_pageerror(mapping);
}
@@ -417,7 +416,7 @@ static void nfs_set_page_writeback(struct page *page)
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 1;
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -433,7 +432,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
}
/*
@@ -672,6 +671,10 @@ static int nfs_writepage_locked(struct page *page,
struct inode *inode = page_file_mapping(page)->host;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
@@ -719,6 +722,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return 0;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
@@ -1409,6 +1416,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
{
int priority = flush_task_priority(how);
+ if (IS_SWAPFILE(hdr->inode))
+ task_setup_data->flags |= RPC_TASK_SWAPPER;
task_setup_data->priority = priority;
rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client);
trace_nfs_initiate_write(hdr);
@@ -1821,7 +1830,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(head, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
@@ -1893,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -2049,21 +2062,21 @@ out:
}
EXPORT_SYMBOL_GPL(nfs_wb_all);
-int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
{
struct nfs_page *req;
int ret = 0;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/* blocking call to cancel all requests and join to a single (head)
* request */
- req = nfs_lock_and_join_requests(page);
+ req = nfs_lock_and_join_requests(&folio->page);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
} else if (req) {
- /* all requests from this page have been cancelled by
+ /* all requests from this folio have been cancelled by
* nfs_lock_and_join_requests, so just remove the head
* request from the inode / page_private pointer and
* release it */
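The writeback hunks above stop using the bdi congestion API and instead gate WB_SYNC_NONE writeback on a per-server write_congested flag with a simple on/off hysteresis. A hedged sketch of that pattern (threshold values are illustrative, not the NFS ones):

#include <linux/atomic.h>

#define EX_CONGESTION_ON_THRESH		256
#define EX_CONGESTION_OFF_THRESH	192

struct ex_server {
	atomic_long_t	writeback;		/* pages currently under writeback */
	unsigned int	write_congested : 1;
};

static void ex_page_writeback_start(struct ex_server *srv)
{
	if (atomic_long_inc_return(&srv->writeback) > EX_CONGESTION_ON_THRESH)
		srv->write_congested = 1;	/* background writeback backs off */
}

static void ex_page_writeback_done(struct ex_server *srv)
{
	if (atomic_long_dec_return(&srv->writeback) < EX_CONGESTION_OFF_THRESH)
		srv->write_congested = 0;
}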
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 3d1d17256a91..f6a2fd3015e7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -35,18 +35,9 @@ config NFSD_V2_ACL
bool
depends on NFSD
-config NFSD_V3
- bool "NFS server support for NFS version 3"
- depends on NFSD
- help
- This option enables support in your system's NFS server for
- version 3 of the NFS protocol (RFC 1813).
-
- If unsure, say Y.
-
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
- depends on NFSD_V3
+ depends on NFSD
select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
@@ -70,7 +61,6 @@ config NFSD_V3_ACL
config NFSD_V4
bool "NFS server support for NFS version 4"
depends on NFSD && PROC_FS
- select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3f0983e93a99..805c06d5f1b4 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,9 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o \
- stats.o filecache.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
-nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e5c0982a381d..b6d01d51a746 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -4,7 +4,6 @@
*/
#include <linux/exportfs.h>
#include <linux/iomap.h>
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 8bc807c5fea4..2c1b027774d4 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -7,6 +7,7 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/list_lru.h>
#include <linux/fsnotify_backend.h>
@@ -236,6 +237,13 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
}
static void
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+}
+
+static void
nfsd_file_do_unhash(struct nfsd_file *nf)
{
lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
@@ -294,19 +302,15 @@ nfsd_file_put_noref(struct nfsd_file *nf)
void
nfsd_file_put(struct nfsd_file *nf)
{
- bool is_hashed;
-
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
+ nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
- return;
+ } else {
+ nfsd_file_put_noref(nf);
+ if (nf->nf_file)
+ nfsd_file_schedule_laundrette();
}
-
- filemap_flush(nf->nf_file->f_mapping);
- is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
- nfsd_file_put_noref(nf);
- if (is_hashed)
- nfsd_file_schedule_laundrette();
if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
nfsd_file_gc();
}
@@ -327,6 +331,7 @@ nfsd_file_dispose_list(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del(&nf->nf_lru);
+ nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
}
}
@@ -340,6 +345,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del(&nf->nf_lru);
+ nfsd_file_flush(nf);
if (!refcount_dec_and_test(&nf->nf_ref))
continue;
if (nfsd_file_free(nf))
@@ -632,7 +638,7 @@ nfsd_file_cache_init(void)
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@ -700,7 +706,7 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
@@ -811,7 +817,7 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
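nfsd_file_cache_init() above switches the hash table allocation to kvcalloc(), which transparently falls back to vmalloc for large tables; the matching frees therefore become kvfree(). A small sketch of the pairing (ex_ names are placeholders):

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/slab.h>

struct ex_bucket { struct list_head head; };

static struct ex_bucket *ex_table;

static int ex_table_init(size_t nbuckets)
{
	size_t i;

	ex_table = kvcalloc(nbuckets, sizeof(*ex_table), GFP_KERNEL);
	if (!ex_table)
		return -ENOMEM;
	for (i = 0; i < nbuckets; i++)
		INIT_LIST_HEAD(&ex_table[i].head);
	return 0;
}

static void ex_table_destroy(void)
{
	kvfree(ex_table);	/* correct for both kmalloc- and vmalloc-backed memory */
	ex_table = NULL;
}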
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index 2e2f1d5e9f62..070f90ed09b6 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -117,7 +117,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
da->netaddr.addr_len =
snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
- "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+ "%s.%d.%d", addr, port >> 8, port & 0xff);
da->tightly_coupled = false;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 367551bddfc6..b5760801d377 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -249,34 +249,34 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
int w;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
if (dentry == NULL || d_really_is_negative(dentry))
- return 1;
+ return true;
inode = d_inode(dentry);
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
- return 0;
+ return false;
rqstp->rq_res.page_len = w = nfsacl_size(
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 1;
+ return true;
w -= PAGE_SIZE;
}
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
resp->mask & NFS_ACL, 0))
- return 0;
+ return false;
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
resp->mask & NFS_DFACL, NFS_ACL_DEFAULT))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* ACCESS */
@@ -286,17 +286,17 @@ nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->access) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
/*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 6d1b5bb051c5..2c05692a9abf 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -422,7 +422,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
if (!new)
return nfserr_jukebox;
- memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
new->lo_state = ls;
spin_lock(&fp->fi_lock);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 32063733443d..234e852fcdfa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4711,6 +4711,14 @@ nfsd_break_deleg_cb(struct file_lock *fl)
return ret;
}
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
static bool nfsd_breaker_owns_lease(struct file_lock *fl)
{
struct nfs4_delegation *dl = fl->fl_owner;
@@ -4718,11 +4726,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
struct nfs4_client *clp;
if (!i_am_nfsd())
- return NULL;
+ return false;
rqst = kthread_data(current);
/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
- return NULL;
+ return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -6526,7 +6534,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
}
static fl_owner_t
-nfsd4_fl_get_owner(fl_owner_t owner)
+nfsd4_lm_get_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6535,7 +6543,7 @@ nfsd4_fl_get_owner(fl_owner_t owner)
}
static void
-nfsd4_fl_put_owner(fl_owner_t owner)
+nfsd4_lm_put_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6570,8 +6578,8 @@ nfsd4_lm_notify(struct file_lock *fl)
static const struct lock_manager_operations nfsd_posix_mng_ops = {
.lm_notify = nfsd4_lm_notify,
- .lm_get_owner = nfsd4_fl_get_owner,
- .lm_put_owner = nfsd4_fl_put_owner,
+ .lm_get_owner = nfsd4_lm_get_owner,
+ .lm_put_owner = nfsd4_lm_put_owner,
};
static inline void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 714a3a3bd50c..da92e7d2ab6a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2854,6 +2854,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
+ if (!(stat.result_mask & STATX_BTIME))
+ /* underlying FS does not offer btime so we can't share it */
+ bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
@@ -3254,6 +3257,13 @@ out_acl:
p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
+ if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
+ *p++ = cpu_to_be32(stat.btime.tv_nsec);
+ }
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
struct kstat parent_stat;
u64 ino = stat.ino;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a4a69ab6ab28..0b3f12aa37ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
-static u32
-nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
-{
- return hash_32((__force u32)xid, nn->maskbits);
-}
-
static struct svc_cacherep *
nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
struct nfsd_net *nn)
@@ -241,6 +235,14 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
list_move_tail(&rp->c_lru, &b->lru_head);
}
+static noinline struct nfsd_drc_bucket *
+nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
+{
+ unsigned int hash = hash_32((__force u32)xid, nn->maskbits);
+
+ return &nn->drc_hashtbl[hash];
+}
+
static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
unsigned int max)
{
@@ -419,12 +421,10 @@ out:
*/
int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd_net *nn;
struct svc_cacherep *rp, *found;
- __be32 xid = rqstp->rq_xid;
__wsum csum;
- u32 hash = nfsd_cache_hash(xid, nn);
- struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
+ struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -440,17 +440,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
+ nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
if (!rp)
goto out;
+ b = nfsd_cache_bucket_find(rqstp->rq_xid, nn);
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
- if (found != rp) {
- nfsd_reply_cache_free_locked(NULL, rp, nn);
- rp = found;
+ if (found != rp)
goto found_entry;
- }
nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
@@ -468,8 +467,10 @@ out:
found_entry:
/* We found a matching entry which is either in progress or done. */
+ nfsd_reply_cache_free_locked(NULL, rp, nn);
nfsd_stats_rc_hits_inc();
rtn = RC_DROPIT;
+ rp = found;
/* Request being processed */
if (rp->c_state == RC_INPROG)
@@ -528,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
- u32 hash;
struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
@@ -536,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
- b = &nn->drc_hashtbl[hash];
+ b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn);
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 68b020f2002b..16920e4512bd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -772,13 +772,13 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
@@ -790,7 +790,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
out_close:
xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
- svc_close_xprt(xprt);
+ svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 3e5008b475ff..4fc1fd639527 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -364,7 +364,7 @@ void nfsd_lockd_shutdown(void);
| FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
| FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
| FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
- | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \
| FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 145208bcb9bd..c29baa03dfaf 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -611,8 +611,6 @@ out_negative:
return nfserr_serverfault;
}
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_fill_pre_attrs - Fill in pre-op attributes
* @fhp: file handle to be updated
@@ -673,8 +671,6 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
-#endif /* CONFIG_NFSD_V3 */
-
/*
* Release a file handle.
*/
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 434930d8a946..fb9d358a267e 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -90,7 +90,6 @@ typedef struct svc_fh {
* operation
*/
int fh_flags; /* FH flags */
-#ifdef CONFIG_NFSD_V3
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
@@ -107,7 +106,6 @@ typedef struct svc_fh {
/* Post-op attributes saved in fh_unlock */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
-#endif /* CONFIG_NFSD_V3 */
} svc_fh;
#define NFSD4_FH_FOREIGN (1<<0)
#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
@@ -283,8 +281,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
}
#endif
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
* @fhp: file handle to be updated
@@ -327,22 +323,6 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
-#else /* !CONFIG_NFSD_V3 */
-
-static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_pre_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_post_attrs(struct svc_fh *fhp)
-{
-}
-
-#endif /* !CONFIG_NFSD_V3 */
-
/*
* Lock a file handle/inode
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 18b8eb43a19b..fcdab8a8a41f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b8c682b62d29..4bb5baa17040 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -117,9 +117,7 @@ static struct svc_stat nfsd_acl_svcstats = {
static const struct svc_version *nfsd_version[] = {
[2] = &nfsd_version2,
-#if defined(CONFIG_NFSD_V3)
[3] = &nfsd_version3,
-#endif
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
#endif
@@ -293,13 +291,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
- error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
@@ -612,13 +610,6 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-static const struct svc_serv_ops nfsd_thread_sv_ops = {
- .svo_shutdown = nfsd_last_thread,
- .svo_function = nfsd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
void nfsd_shutdown_threads(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -657,8 +648,7 @@ int nfsd_create_serv(struct net *net)
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- &nfsd_thread_sv_ops);
+ serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
if (serv == NULL)
return -ENOMEM;
@@ -724,7 +714,8 @@ void nfsd_put(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) {
- svc_shutdown_net(nn->nfsd_serv, net);
+ svc_xprt_destroy_all(nn->nfsd_serv, net);
+ nfsd_last_thread(nn->nfsd_serv, net);
svc_destroy(&nn->nfsd_serv->sv_refcnt);
spin_lock(&nfsd_notifier_lock);
nn->nfsd_serv = NULL;
@@ -1019,8 +1010,6 @@ out:
msleep(20);
}
- /* Release module */
- module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 5889db66409d..242fa123e0e9 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -13,22 +13,6 @@
#include "export.h"
#include "nfsfh.h"
-#define NFSD_TRACE_PROC_ARG_FIELDS \
- __field(unsigned int, netns_ino) \
- __field(u32, xid) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
-
-#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
- do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
- } while (0);
-
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
@@ -53,16 +37,22 @@ DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
),
TP_ARGS(rqstp),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_ARG_FIELDS
-
+ __field(unsigned int, netns_ino)
+ __field(u32, xid)
__field(u32, vers)
__field(u32, proc)
+ __sockaddr(server, rqstp->rq_xprt->xpt_locallen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
- NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+ const struct svc_xprt *xprt = rqstp->rq_xprt;
+ __entry->netns_ino = xprt->xpt_net->ns.inum;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->vers = rqstp->rq_vers;
__entry->proc = rqstp->rq_proc;
+ __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen);
+ __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen);
),
TP_printk("xid=0x%08x vers=%u proc=%u",
__entry->xid, __entry->vers, __entry->proc
@@ -613,20 +603,21 @@ TRACE_EVENT(nfsd_clid_cred_mismatch,
__field(u32, cl_id)
__field(unsigned long, cl_flavor)
__field(unsigned long, new_flavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cl_flavor = clp->cl_cred.cr_flavor;
__entry->new_flavor = rqstp->rq_cred.cr_flavor;
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
show_nfsd_authflavor(__entry->cl_flavor),
- show_nfsd_authflavor(__entry->new_flavor), __entry->addr
+ show_nfsd_authflavor(__entry->new_flavor),
+ __get_sockaddr(addr)
)
)
@@ -642,7 +633,7 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
__field(u32, cl_id)
__array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
__array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -651,14 +642,14 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
NFS4_VERIFIER_SIZE);
memcpy(__entry->new_verifier, (void *)verf,
NFS4_VERIFIER_SIZE);
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
__print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
__print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
- __entry->addr
+ __get_sockaddr(addr)
)
);
@@ -908,18 +899,17 @@ TRACE_EVENT(nfsd_cb_args,
__field(u32, cl_id)
__field(u32, prog)
__field(u32, ident)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, conn->cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->prog = conn->cb_prog;
__entry->ident = conn->cb_ident;
- memcpy(__entry->addr, &conn->cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->prog, __entry->ident)
);
@@ -951,17 +941,17 @@ DECLARE_EVENT_CLASS(nfsd_cb_class,
__field(unsigned long, state)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->state = clp->cl_cb_state;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x state=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
show_cb_state(__entry->state))
);
@@ -1001,7 +991,7 @@ TRACE_EVENT(nfsd_cb_setup,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(unsigned long, authflavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
__array(unsigned char, netid, 8)
),
TP_fast_assign(
@@ -1009,11 +999,11 @@ TRACE_EVENT(nfsd_cb_setup,
__entry->cl_id = clp->cl_clientid.cl_id;
strlcpy(__entry->netid, netid, sizeof(__entry->netid));
__entry->authflavor = authflavor;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->netid, show_nfsd_authflavor(__entry->authflavor))
);
@@ -1027,30 +1017,32 @@ TRACE_EVENT(nfsd_cb_setup_err,
__field(long, error)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->error = error;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x error=%ld",
- __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->error)
);
-TRACE_EVENT(nfsd_cb_recall,
+TRACE_EVENT_CONDITION(nfsd_cb_recall,
TP_PROTO(
const struct nfs4_stid *stid
),
TP_ARGS(stid),
+ TP_CONDITION(stid->sc_client),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, si_id)
__field(u32, si_generation)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const stateid_t *stp = &stid->sc_stateid;
@@ -1060,14 +1052,11 @@ TRACE_EVENT(nfsd_cb_recall,
__entry->cl_id = stp->si_opaque.so_clid.cl_id;
__entry->si_id = stp->si_opaque.so_id;
__entry->si_generation = stp->si_generation;
- if (clp)
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
- else
- memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation)
);
@@ -1081,7 +1070,7 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, fh_hash)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const struct nfs4_client *clp = lo->lo_owner.so_client;
@@ -1089,11 +1078,11 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->fh_hash)
);
@@ -1114,7 +1103,7 @@ TRACE_EVENT(nfsd_cb_offload,
__field(u32, fh_hash)
__field(int, status)
__field(u64, count)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
@@ -1124,11 +1113,11 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash = knfsd_fh_hash(fh);
__entry->status = be32_to_cpu(status);
__entry->count = count;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation,
__entry->fh_hash, __entry->count, __entry->status)
);
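The trace.h hunks above convert fixed-size sockaddr arrays into the dynamic __sockaddr()/__assign_sockaddr()/__get_sockaddr() helpers, so each event records only as many address bytes as the transport provides. A hedged sketch of a tracepoint using that pattern (the event name and arguments are made up, and the usual trace header boilerplate is omitted):

TRACE_EVENT(example_connect,
	TP_PROTO(const struct sockaddr *sap, size_t salen),
	TP_ARGS(sap, salen),
	TP_STRUCT__entry(
		__sockaddr(addr, salen)		/* dynamically sized per event */
	),
	TP_fast_assign(
		__assign_sockaddr(addr, sap, salen);
	),
	TP_printk("addr=%pISpc", __get_sockaddr(addr))
);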
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 91600e71be19..c22ad0532e8e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -26,15 +26,14 @@
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/ima.h>
+#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
-#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
#include "../internal.h"
@@ -608,7 +607,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif /* defined(CONFIG_NFSD_V4) */
-#ifdef CONFIG_NFSD_V3
/*
* Check server access rights to a file system object
*/
@@ -720,7 +718,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
out:
return error;
}
-#endif /* CONFIG_NFSD_V3 */
int nfsd_open_break_lease(struct inode *inode, int access)
{
@@ -1113,7 +1110,6 @@ out:
return err;
}
-#ifdef CONFIG_NFSD_V3
/**
* nfsd_commit - Commit pending writes to stable storage
* @rqstp: RPC request being processed
@@ -1190,7 +1186,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
out:
return err;
}
-#endif /* CONFIG_NFSD_V3 */
static __be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
@@ -1380,8 +1375,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
rdev, resfhp);
}
-#ifdef CONFIG_NFSD_V3
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
@@ -1547,7 +1540,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
goto out;
}
-#endif /* CONFIG_NFSD_V3 */
/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2c43d10e3cab..ccb87b2864f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -68,7 +68,6 @@ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
-#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -76,7 +75,6 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
u32 *verifier, bool *truncp, bool *created);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 528fb299430e..852f71580bd0 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -32,7 +32,7 @@ struct nfsd_readargs {
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
+ __u32 len;
struct xdr_buf payload;
};
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 66bdaa2cf496..ca611ac09f7c 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -20,6 +20,23 @@
#include "page.h"
#include "btnode.h"
+
+/**
+ * nilfs_init_btnc_inode - initialize B-tree node cache inode
+ * @btnc_inode: inode to be initialized
+ *
+ * nilfs_init_btnc_inode() sets up an inode for B-tree node cache.
+ */
+void nilfs_init_btnc_inode(struct inode *btnc_inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(btnc_inode);
+
+ btnc_inode->i_mode = S_IFREG;
+ ii->i_flags = 0;
+ memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
+}
+
void nilfs_btnode_cache_clear(struct address_space *btnc)
{
invalidate_mapping_pages(btnc, 0, -1);
@@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
struct buffer_head *
nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
{
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct buffer_head *bh;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
@@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
struct buffer_head **pbh, sector_t *submit_ptr)
{
struct buffer_head *bh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct page *page;
int err;
@@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
struct nilfs_btnode_chkey_ctxt *ctxt)
{
struct buffer_head *obh, *nbh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
int err;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 11663650add7..bd5544e63a01 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt {
struct buffer_head *newbh;
};
+void nilfs_init_btnc_inode(struct inode *btnc_inode);
void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 3594eabe1419..f544c22fff78 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
__u64 ptr, struct buffer_head **bhp)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh;
bh = nilfs_btnode_create_block(btnc, ptr);
@@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
struct buffer_head **bhp,
const struct nilfs_btree_readahead_info *ra)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh, *ra_bh;
sector_t submit_ptr = 0;
int ret;
@@ -1741,6 +1743,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
dat = nilfs_bmap_get_dat(btree);
}
+ ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode);
+ if (ret < 0)
+ return ret;
+
ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
if (ret < 0)
return ret;
@@ -1913,7 +1919,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
path[level].bp_ctxt.bh = path[level].bp_bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0) {
nilfs_dat_abort_update(dat,
@@ -1939,7 +1945,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
if (buffer_nilfs_node(path[level].bp_bh)) {
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
path[level].bp_bh = path[level].bp_ctxt.bh;
}
@@ -1958,7 +1964,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
&path[level].bp_newreq.bpr_req);
if (buffer_nilfs_node(path[level].bp_bh))
nilfs_btnode_abort_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
}
@@ -2134,7 +2140,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
struct list_head *listp)
{
- struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btcache = btnc_inode->i_mapping;
struct list_head lists[NILFS_BTREE_LEVEL_MAX];
struct pagevec pvec;
struct buffer_head *bh, *head;
@@ -2188,12 +2195,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
path[level].bp_ctxt.newkey = blocknr;
path[level].bp_ctxt.bh = *bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0)
return ret;
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
*bh = path[level].bp_ctxt.bh;
}
@@ -2398,6 +2405,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
+ else
+ ret = nilfs_attach_btree_node_cache(
+ &NILFS_BMAP_I(bmap)->vfs_inode);
+
return ret;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index dc51d3b7a7bf..3b55e239705f 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
di = NILFS_DAT_I(dat);
lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
nilfs_palloc_setup_cache(dat, &di->palloc_cache);
- nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ err = nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ if (err)
+ goto failed;
err = nilfs_read_inode_common(dat, raw_inode);
if (err)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index a8f5315f01e3..04fdd420eae7 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
__u64 vbn, struct buffer_head **out_bh)
{
+ struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
int ret;
- ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+ ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
vbn ? : pbn, pbn, REQ_OP_READ, 0,
out_bh, &pbn);
if (ret == -EEXIST) /* internal code (cache hit) */
@@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
- return 0;
+ return nilfs_attach_btree_node_cache(inode);
}
/**
@@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e3d807d5b83a..6045cea21f52 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -29,12 +29,16 @@
* @cno: checkpoint number
* @root: pointer on NILFS root object (mounted checkpoint)
* @for_gc: inode for GC flag
+ * @for_btnc: inode for B-tree node cache flag
+ * @for_shadow: inode for shadowed page cache flag
*/
struct nilfs_iget_args {
u64 ino;
__u64 cno;
struct nilfs_root *root;
- int for_gc;
+ bool for_gc;
+ bool for_btnc;
+ bool for_shadow;
};
static int nilfs_iget_test(struct inode *inode, void *opaque);
@@ -199,23 +203,22 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
-static int nilfs_set_page_dirty(struct page *page)
+static bool nilfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- int ret = __set_page_dirty_nobuffers(page);
+ struct inode *inode = mapping->host;
+ struct buffer_head *head;
+ unsigned int nr_dirty = 0;
+ bool ret = filemap_dirty_folio(mapping, folio);
- if (page_has_buffers(page)) {
- unsigned int nr_dirty = 0;
- struct buffer_head *bh, *head;
+ /*
+ * The page may not be locked, eg if called from try_to_unmap_one()
+ */
+ spin_lock(&mapping->private_lock);
+ head = folio_buffers(folio);
+ if (head) {
+ struct buffer_head *bh = head;
- /*
- * This page is locked by callers, and no other thread
- * concurrently marks its buffers dirty since they are
- * only dirtied through routines in fs/buffer.c in
- * which call sites of mark_buffer_dirty are protected
- * by page lock.
- */
- bh = head = page_buffers(page);
do {
/* Do not mark hole blocks dirty */
if (buffer_dirty(bh) || !buffer_mapped(bh))
@@ -224,14 +227,13 @@ static int nilfs_set_page_dirty(struct page *page)
set_buffer_dirty(bh);
nr_dirty++;
} while (bh = bh->b_this_page, bh != head);
-
- if (nr_dirty)
- nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
+ }
+ spin_unlock(&mapping->private_lock);
+ if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
- }
return ret;
}
@@ -299,12 +301,12 @@ const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.writepages = nilfs_writepages,
- .set_page_dirty = nilfs_set_page_dirty,
+ .dirty_folio = nilfs_dirty_folio,
.readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
/* .releasepage = nilfs_releasepage, */
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.direct_IO = nilfs_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
};
@@ -314,7 +316,8 @@ static int nilfs_insert_inode_locked(struct inode *inode,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
@@ -527,6 +530,19 @@ static int nilfs_iget_test(struct inode *inode, void *opaque)
return 0;
ii = NILFS_I(inode);
+ if (test_bit(NILFS_I_BTNC, &ii->i_state)) {
+ if (!args->for_btnc)
+ return 0;
+ } else if (args->for_btnc) {
+ return 0;
+ }
+ if (test_bit(NILFS_I_SHADOW, &ii->i_state)) {
+ if (!args->for_shadow)
+ return 0;
+ } else if (args->for_shadow) {
+ return 0;
+ }
+
if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
return !args->for_gc;
@@ -538,15 +554,17 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
struct nilfs_iget_args *args = opaque;
inode->i_ino = args->ino;
- if (args->for_gc) {
+ NILFS_I(inode)->i_cno = args->cno;
+ NILFS_I(inode)->i_root = args->root;
+ if (args->root && args->ino == NILFS_ROOT_INO)
+ nilfs_get_root(args->root);
+
+ if (args->for_gc)
NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
- NILFS_I(inode)->i_cno = args->cno;
- NILFS_I(inode)->i_root = NULL;
- } else {
- if (args->root && args->ino == NILFS_ROOT_INO)
- nilfs_get_root(args->root);
- NILFS_I(inode)->i_root = args->root;
- }
+ if (args->for_btnc)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC);
+ if (args->for_shadow)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW);
return 0;
}
@@ -554,7 +572,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return ilookup5(sb, ino, nilfs_iget_test, &args);
@@ -564,7 +583,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
@@ -595,7 +615,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
__u64 cno)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+ .ino = ino, .root = NULL, .cno = cno, .for_gc = true,
+ .for_btnc = false, .for_shadow = false
};
struct inode *inode;
int err;
@@ -615,6 +636,113 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
return inode;
}
+/**
+ * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode
+ * @inode: inode object
+ *
+ * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode,
+ * or does nothing if the inode already has it. This function allocates
+ * an additional inode to maintain page cache of B-tree nodes one-on-one.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode;
+ struct nilfs_iget_args args;
+
+ if (ii->i_assoc_inode)
+ return 0;
+
+ args.ino = inode->i_ino;
+ args.root = ii->i_root;
+ args.cno = ii->i_cno;
+ args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0;
+ args.for_btnc = true;
+ args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
+
+ btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!btnc_inode))
+ return -ENOMEM;
+ if (btnc_inode->i_state & I_NEW) {
+ nilfs_init_btnc_inode(btnc_inode);
+ unlock_new_inode(btnc_inode);
+ }
+ NILFS_I(btnc_inode)->i_assoc_inode = inode;
+ NILFS_I(btnc_inode)->i_bmap = ii->i_bmap;
+ ii->i_assoc_inode = btnc_inode;
+
+ return 0;
+}
+
+/**
+ * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode
+ * @inode: inode object
+ *
+ * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its
+ * holder inode bound to @inode, or does nothing if @inode doesn't have it.
+ */
+void nilfs_detach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode = ii->i_assoc_inode;
+
+ if (btnc_inode) {
+ NILFS_I(btnc_inode)->i_assoc_inode = NULL;
+ ii->i_assoc_inode = NULL;
+ iput(btnc_inode);
+ }
+}
+
+/**
+ * nilfs_iget_for_shadow - obtain inode for shadow mapping
+ * @inode: inode object that uses shadow mapping
+ *
+ * nilfs_iget_for_shadow() allocates a pair of inodes that holds page
+ * caches for shadow mapping. The page cache for data pages is set up
+ * in one inode and the one for b-tree node pages is set up in the
+ * other inode, which is attached to the former inode.
+ *
+ * Return Value: On success, a pointer to the inode for data pages is
+ * returned. On errors, one of the following negative error code is returned
+ * in a pointer type.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+struct inode *nilfs_iget_for_shadow(struct inode *inode)
+{
+ struct nilfs_iget_args args = {
+ .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = true
+ };
+ struct inode *s_inode;
+ int err;
+
+ s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!s_inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(s_inode->i_state & I_NEW))
+ return inode;
+
+ NILFS_I(s_inode)->i_flags = 0;
+ memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS);
+
+ err = nilfs_attach_btree_node_cache(s_inode);
+ if (unlikely(err)) {
+ iget_failed(s_inode);
+ return ERR_PTR(err);
+ }
+ unlock_new_inode(s_inode);
+ return s_inode;
+}
+
void nilfs_write_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode, int has_bmap)
{
@@ -762,7 +890,8 @@ static void nilfs_clear_inode(struct inode *inode)
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ if (!test_bit(NILFS_I_BTNC, &ii->i_state))
+ nilfs_detach_btree_node_cache(inode);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
nilfs_put_root(ii->i_root);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 4b3d33cf0041..d29a0f2b9c16 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -434,7 +434,8 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
static const struct address_space_operations def_mdt_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = nilfs_mdt_write_page,
};
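The same conversion recurs throughout this series: buffer-head backed mappings move from .set_page_dirty = __set_page_dirty_buffers to .dirty_folio = block_dirty_folio (usually gaining .invalidate_folio = block_invalidate_folio as well), while mappings without buffer heads use filemap_dirty_folio. A generic sketch, with fs_* names as placeholders:

static const struct address_space_operations fs_buffer_aops = {
	.dirty_folio		= block_dirty_folio,		/* was __set_page_dirty_buffers */
	.invalidate_folio	= block_invalidate_folio,	/* was block_invalidatepage */
	.writepage		= fs_writepage,			/* fs-specific, unchanged */
};

static const struct address_space_operations fs_nobuffer_aops = {
	.dirty_folio		= filemap_dirty_folio,		/* was __set_page_dirty_nobuffers */
	.writepage		= fs_writepage,
};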
@@ -470,9 +471,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
void nilfs_mdt_clear(struct inode *inode)
{
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+ struct nilfs_shadow_map *shadow = mdi->mi_shadow;
if (mdi->mi_palloc_cache)
nilfs_palloc_destroy_cache(inode);
+
+ if (shadow) {
+ struct inode *s_inode = shadow->inode;
+
+ shadow->inode = NULL;
+ iput(s_inode);
+ mdi->mi_shadow = NULL;
+ }
}
/**
@@ -506,12 +516,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+ struct inode *s_inode;
INIT_LIST_HEAD(&shadow->frozen_buffers);
- address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, inode);
- address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, inode);
+
+ s_inode = nilfs_iget_for_shadow(inode);
+ if (IS_ERR(s_inode))
+ return PTR_ERR(s_inode);
+
+ shadow->inode = s_inode;
mi->mi_shadow = shadow;
return 0;
}
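With the shadow map backed by a real inode pair, the checkpoint/rollback cycle reduces to copying dirty pages between the metadata inode's mappings and the shadow inode's mappings. An outline of that cycle, condensed from the functions below (illustrative wrapper, not new kernel code):

static int example_shadow_cycle(struct inode *mdt_inode)
{
	int err;

	/* freeze the current state into shadow->inode's page caches */
	err = nilfs_mdt_save_to_shadow_map(mdt_inode);
	if (err)
		return err;

	/* ... speculative updates to mdt_inode's caches happen here ... */

	/* on failure, copy everything back from the shadow inode */
	nilfs_mdt_restore_from_shadow_map(mdt_inode);
	return 0;
}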
@@ -525,14 +538,15 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode)
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_inode_info *ii = NILFS_I(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *s_inode = shadow->inode;
int ret;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+ ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping);
if (ret)
goto out;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
- &ii->i_btnode_cache);
+ ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping,
+ ii->i_assoc_inode->i_mapping);
if (ret)
goto out;
@@ -548,7 +562,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+ page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index);
if (!page)
return -ENOMEM;
@@ -580,7 +594,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
struct page *page;
int n;
- page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+ page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index);
if (page) {
if (page_has_buffers(page)) {
n = bh_offset(bh) >> inode->i_blkbits;
@@ -621,10 +635,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
nilfs_palloc_clear_cache(inode);
nilfs_clear_dirty_pages(inode->i_mapping, true);
- nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+ nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
- nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
- nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+ nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true);
+ nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping,
+ NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
@@ -639,10 +654,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode;
down_write(&mi->mi_sem);
nilfs_release_frozen_buffers(shadow);
- truncate_inode_pages(&shadow->frozen_data, 0);
- truncate_inode_pages(&shadow->frozen_btnodes, 0);
+ truncate_inode_pages(shadow->inode->i_mapping, 0);
+ truncate_inode_pages(shadow_btnc_inode->i_mapping, 0);
up_write(&mi->mi_sem);
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 8f86080a436d..9e23bab3ff12 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -18,14 +18,12 @@
/**
* struct nilfs_shadow_map - shadow mapping of meta data file
* @bmap_store: shadow copy of bmap state
- * @frozen_data: shadowed dirty data pages
- * @frozen_btnodes: shadowed dirty b-tree nodes' pages
+ * @inode: holder of page caches used in shadow mapping
* @frozen_buffers: list of frozen buffers
*/
struct nilfs_shadow_map {
struct nilfs_bmap_store bmap_store;
- struct address_space frozen_data;
- struct address_space frozen_btnodes;
+ struct inode *inode;
struct list_head frozen_buffers;
};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a7b81755c350..1344f7d475d3 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -28,7 +28,7 @@
* @i_xattr: <TODO>
* @i_dir_start_lookup: page index of last successful search
* @i_cno: checkpoint number for GC inode
- * @i_btnode_cache: cached pages of b-tree nodes
+ * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer)
* @i_dirty: list for connecting dirty files
* @xattr_sem: semaphore for extended attributes processing
* @i_bh: buffer contains disk inode
@@ -43,7 +43,7 @@ struct nilfs_inode_info {
__u64 i_xattr; /* sector_t ??? */
__u32 i_dir_start_lookup;
__u64 i_cno; /* check point number for GC inode */
- struct address_space i_btnode_cache;
+ struct inode *i_assoc_inode;
struct list_head i_dirty; /* List for connecting dirty files */
#ifdef CONFIG_NILFS_XATTR
@@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap)
return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}
-static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
-{
- struct nilfs_inode_info *ii =
- container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
- return &ii->vfs_inode;
-}
-
/*
* Dynamic state flags of NILFS on-memory inode (i_state)
*/
@@ -98,6 +91,8 @@ enum {
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
NILFS_I_GCINODE, /* inode for GC, on memory only */
+ NILFS_I_BTNC, /* inode for btree node cache */
+ NILFS_I_SHADOW, /* inode for shadowed page cache */
};
/*
@@ -267,6 +262,9 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
unsigned long ino, __u64 cno);
+int nilfs_attach_btree_node_cache(struct inode *inode);
+void nilfs_detach_btree_node_cache(struct inode *inode);
+struct inode *nilfs_iget_for_shadow(struct inode *inode);
extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 063dd16d75b5..a8e88cc38e16 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -436,22 +436,12 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page,
return nc;
}
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
-{
- mapping->host = inode;
- mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
- mapping->a_ops = &empty_aops;
-}
-
/*
* NILFS2 needs clear_page_dirty() in the following two cases:
*
- * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
- * page dirty flags when it copies back pages from the shadow cache
- * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
- * (dat->{i_mapping,i_btnode_cache}).
+ * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty
+ * flag of pages when it copies back pages from shadow cache to the
+ * original cache.
*
* 2) Some B-tree operations like insertion or deletion may dispose buffers
* in dirty state, and this needs to cancel the dirty state of their pages.
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 569263b23c0c..21ddcdd4d63e 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -43,7 +43,6 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 43287b0d3e9b..1362ccb64ec7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -337,26 +337,12 @@ static void nilfs_end_bio_write(struct bio *bio)
}
static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
- struct nilfs_write_info *wi, int mode,
- int mode_flags)
+ struct nilfs_write_info *wi)
{
struct bio *bio = wi->bio;
- int err;
-
- if (segbuf->sb_nbio > 0 &&
- bdi_write_congested(segbuf->sb_super->s_bdi)) {
- wait_for_completion(&segbuf->sb_bio_event);
- segbuf->sb_nbio--;
- if (unlikely(atomic_read(&segbuf->sb_err))) {
- bio_put(bio);
- err = -EIO;
- goto failed;
- }
- }
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_set_op_attrs(bio, mode, mode_flags);
submit_bio(bio);
segbuf->sb_nbio++;
@@ -365,33 +351,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end;
return 0;
-
- failed:
- wi->bio = NULL;
- return err;
-}
-
-/**
- * nilfs_alloc_seg_bio - allocate a new bio for writing log
- * @nilfs: nilfs object
- * @start: start block number of the bio
- * @nr_vecs: request size of page vector.
- *
- * Return Value: On success, pointer to the struct bio is returned.
- * On error, NULL is returned.
- */
-static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
- int nr_vecs)
-{
- struct bio *bio;
-
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (likely(bio)) {
- bio_set_dev(bio, nilfs->ns_bdev);
- bio->bi_iter.bi_sector =
- start << (nilfs->ns_blocksize_bits - 9);
- }
- return bio;
}
static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
@@ -407,17 +366,17 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi,
- struct buffer_head *bh, int mode)
+ struct buffer_head *bh)
{
int len, err;
BUG_ON(wi->nr_vecs <= 0);
repeat:
if (!wi->bio) {
- wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
- wi->nr_vecs);
- if (unlikely(!wi->bio))
- return -ENOMEM;
+ wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs,
+ REQ_OP_WRITE, GFP_NOIO);
+ wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) <<
+ (wi->nilfs->ns_blocksize_bits - 9);
}
len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -426,7 +385,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
return 0;
}
/* bio is FULL */
- err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0);
+ err = nilfs_segbuf_submit_bio(segbuf, wi);
/* never submit current bh */
if (likely(!err))
goto repeat;
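The consolidated bio_alloc() takes the block device, vector count, and operation up front; with a sleeping GFP mask and nr_vecs <= BIO_MAX_VECS it cannot return NULL, which is why the allocation error path disappears. A minimal, self-contained sketch (synchronous submission chosen only to keep it short):

static int example_write_one_page(struct block_device *bdev, struct page *page,
				  sector_t sector)
{
	struct bio *bio;
	int err;

	bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_NOIO);	/* device + op set here */
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);

	err = submit_bio_wait(bio);
	bio_put(bio);
	return err;
}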
@@ -456,13 +415,13 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
nilfs_segbuf_prepare_write(segbuf, &wi);
list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
@@ -472,8 +431,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
* Last BIO is always sent through the following
* submission.
*/
- res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE,
- REQ_SYNC);
+ wi.bio->bi_opf |= REQ_SYNC;
+ res = nilfs_segbuf_submit_bio(segbuf, &wi);
}
failed_bio:
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 85a853334771..0afe0832c754 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -733,15 +733,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
struct list_head *listp)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct address_space *mapping = &ii->i_btnode_cache;
+ struct inode *btnc_inode = ii->i_assoc_inode;
struct pagevec pvec;
struct buffer_head *bh, *head;
unsigned int i;
pgoff_t index = 0;
+ if (!btnc_inode)
+ return;
+
pagevec_init(&pvec);
- while (pagevec_lookup_tag(&pvec, mapping, &index,
+ while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index,
PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
@@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
continue;
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 63e5fa74016c..ba108f915391 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -151,13 +151,14 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
- ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
+ ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS);
if (!ii)
return NULL;
ii->i_bh = NULL;
ii->i_state = 0;
ii->i_cno = 0;
- nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
+ ii->i_assoc_inode = NULL;
+ ii->i_bmap = &ii->i_bmap_data;
return &ii->vfs_inode;
}
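alloc_inode_sb() replaces the bare kmem_cache_alloc() so that newly allocated inodes are charged against the superblock's memcg-aware inode LRU; every ->alloc_inode conversion in this series has the same shape. A generic sketch with foo_* as placeholder names:

static struct inode *foo_alloc_inode(struct super_block *sb)
{
	struct foo_inode_info *fi;

	/* allocate from the fs inode cache via the superblock so the object
	 * is accounted to sb's inode list_lru */
	fi = alloc_inode_sb(sb, foo_inode_cachep, GFP_NOFS);
	if (!fi)
		return NULL;

	/* fs-specific per-inode initialisation goes here ... */
	return &fi->vfs_inode;
}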
@@ -1377,8 +1378,6 @@ static void nilfs_inode_init_once(void *obj)
#ifdef CONFIG_NILFS_XATTR
init_rwsem(&ii->xattr_sem);
#endif
- address_space_init_once(&ii->i_btnode_cache);
- ii->i_bmap = &ii->i_bmap_data;
inode_init_once(&ii->vfs_inode);
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2ff6bd85ba8f..9b32b76a9c30 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1003,17 +1003,18 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
__u32 mask, unsigned int flags,
__u32 umask, int *destroy)
{
- __u32 oldmask = 0;
+ __u32 oldmask, newmask;
/* umask bits cannot be removed by user */
mask &= ~umask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask &= ~mask;
} else {
fsn_mark->ignored_mask &= ~mask;
}
+ newmask = fsnotify_calc_mask(fsn_mark);
/*
* We need to keep the mark around even if remaining mask cannot
* result in any events (e.g. mask == FAN_ONDIR) to support incremental
@@ -1023,7 +1024,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
- return mask & oldmask;
+ return oldmask & ~newmask;
}
static int fanotify_remove_mark(struct fsnotify_group *group,
@@ -1080,24 +1081,42 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
+static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
+{
+ fsn_mark->ignored_mask |= mask;
+
+ /*
+ * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
+ * the removal of the FS_MODIFY bit in calculated mask if it was set
+ * because of an ignored mask that is now going to survive FS_MODIFY.
+ */
+ if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
+ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ if (!(fsn_mark->mask & FS_MODIFY))
+ *removed = FS_MODIFY;
+ }
+}
+
static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask,
- unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
{
- __u32 oldmask = -1;
+ __u32 oldmask, newmask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask |= mask;
} else {
- fsn_mark->ignored_mask |= mask;
- if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
- fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
}
+ newmask = fsnotify_calc_mask(fsn_mark);
spin_unlock(&fsn_mark->lock);
- return mask & ~oldmask;
+ return newmask & ~oldmask;
}
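Both the remove and add paths now follow one pattern: snapshot the effective mask with fsnotify_calc_mask() before and after editing the mark, and report which bits changed so the caller knows whether the connector mask must be recalculated. A condensed sketch of that pattern (helper name illustrative):

static void example_edit_mark(struct fsnotify_mark *mark, __u32 set, __u32 clear,
			      __u32 *added, __u32 *removed)
{
	__u32 oldmask, newmask;

	spin_lock(&mark->lock);
	oldmask = fsnotify_calc_mask(mark);	/* mask plus ignored-mask contribution */
	mark->mask |= set;
	mark->mask &= ~clear;
	newmask = fsnotify_calc_mask(mark);
	spin_unlock(&mark->lock);

	*added = newmask & ~oldmask;		/* bits that became effective */
	*removed = oldmask & ~newmask;		/* bits no longer needed */
}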
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
@@ -1155,7 +1174,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added;
+ __u32 added, removed = 0;
int ret = 0;
mutex_lock(&group->mark_mutex);
@@ -1178,8 +1197,8 @@ static int fanotify_add_mark(struct fsnotify_group *group,
goto out;
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- if (added & ~fsnotify_conn_mask(fsn_mark->connector))
+ added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
+ if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
fsnotify_recalc_mask(fsn_mark->connector);
out:
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ab81a0776ece..70a8516b78bc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
/* for each watch, send FS_UNMOUNT and then remove it */
fsnotify_inode(inode, FS_UNMOUNT);
@@ -85,8 +84,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
}
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
}
void fsnotify_sb_delete(struct super_block *sb)
@@ -531,11 +529,13 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
/*
- * if this is a modify event we may need to clear the ignored masks
- * otherwise return if none of the marks care about this type of event.
+ * If this is a modify event we may need to clear some ignored masks.
+ * In that case, the object with ignored masks will have the FS_MODIFY
+ * event in its mask.
+ * Otherwise, return if none of the marks care about this type of event.
*/
test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
+ if (!(test_mask & marks_mask))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 9007d6affff3..4853184f7dde 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -127,7 +127,7 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
return;
hlist_for_each_entry(mark, &conn->list, obj_list) {
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= mark->mask;
+ new_mask |= fsnotify_calc_mask(mark);
}
*fsnotify_conn_mask_p(conn) = new_mask;
}
@@ -692,7 +692,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
if (ret)
goto err;
- if (mark->mask)
+ if (mark->mask || mark->ignored_mask)
fsnotify_recalc_mask(mark->connector);
return ret;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index bb0a43860ad2..90e3dad8ee45 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -593,12 +593,12 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
iblock = initialized_size >> blocksize_bits;
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -653,7 +653,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
// Update initialized size in the attribute and
// in the inode.
// Again, for each page do:
- // __set_page_dirty_buffers();
+ // block_dirty_folio();
// put_page()
// We don't need to wait on the writes.
// Update iblock.
@@ -1350,12 +1350,13 @@ retry_writepage:
/* Is the page fully outside i_size? (truncate in progress) */
if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
+ struct folio *folio = page_folio(page);
/*
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ block_invalidate_folio(folio, 0, folio_size(folio));
+ folio_unlock(folio);
ntfs_debug("Write outside i_size - truncated?");
return 0;
}
@@ -1653,7 +1654,7 @@ const struct address_space_operations ntfs_normal_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.bmap = ntfs_bmap,
.migratepage = buffer_migrate_page,
@@ -1668,7 +1669,7 @@ const struct address_space_operations ntfs_compressed_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1683,9 +1684,7 @@ const struct address_space_operations ntfs_mst_aops = {
.readpage = ntfs_readpage, /* Fill page with data. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
- .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
- without touching the buffers
- belonging to the page. */
+ .dirty_folio = filemap_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1747,7 +1746,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
set_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
spin_unlock(&mapping->private_lock);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(mapping, page_folio(page));
if (unlikely(buffers_to_free)) {
do {
bh = buffers_to_free->b_this_page;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4474adb393ca..efe0602b4e51 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
+ ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4de9acb16968..3de5700a9b83 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
return err;
}
-static inline struct bio *ntfs_alloc_bio(u32 nr_vecs)
-{
- struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
-
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
- }
- return bio;
-}
-
/*
* ntfs_bio_pages - Read/write pages from/to disk.
*/
@@ -1496,19 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
lbo = ((u64)lcn << cluster_bits) + off;
len = ((u64)clen << cluster_bits) - off;
new_bio:
- new = ntfs_alloc_bio(nr_pages - page_idx);
- if (!new) {
- err = -ENOMEM;
- goto out;
- }
+ new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = lbo >> 9;
- bio->bi_opf = op;
while (len) {
off = vbo & (PAGE_SIZE - 1);
@@ -1599,18 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
lbo = (u64)lcn << cluster_bits;
len = (u64)clen << cluster_bits;
new_bio:
- new = ntfs_alloc_bio(BIO_MAX_VECS);
- if (!new) {
- err = -ENOMEM;
- break;
- }
+ new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = lbo >> 9;
for (;;) {
@@ -1626,11 +1603,10 @@ new_bio:
}
} while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));
- if (bio) {
- if (!err)
- err = submit_bio_wait(bio);
- bio_put(bio);
- }
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+
blk_finish_plug(&plug);
out:
unlock_page(fill);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index a87ab3ad3cd3..9eab11e3b034 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1950,7 +1950,7 @@ const struct address_space_operations ntfs_aops = {
.write_end = ntfs_write_end,
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
};
const struct address_space_operations ntfs_aops_cmpr = {
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 29813200c7af..278dcf502410 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep;
static struct inode *ntfs_alloc_inode(struct super_block *sb)
{
- struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS);
+ struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);
if (!ni)
return NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index bf9357123bc5..49f41074baad 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5981,7 +5981,7 @@ bail:
return status;
}
-/* Expects you to already be holding tl_inode->i_mutex */
+/* Expects you to already be holding tl_inode->i_rwsem */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 498da317580a..4b9af65cb61b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
down_write(&oi->ip_alloc_sem);
- /* Delete orphan before acquire i_mutex. */
+ /* Delete orphan before acquiring i_rwsem. */
if (dwc->dw_orphaned) {
BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
@@ -2453,7 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations ocfs2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
.readpage = ocfs2_readpage,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
@@ -2461,7 +2461,7 @@ const struct address_space_operations ocfs2_aops = {
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a17be1618bf7..ea0e70c0fce0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio_set_dev(bio, reg->hr_bdev);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- bio_set_op_attrs(bio, op, op_flags);
vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 625c92521416..27fee68f860a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- /* this runs under the parent dir's i_mutex; there can be only
+ /* this runs under the parent dir's i_rwsem; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
return ERR_PTR(-ENOSPC);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2cc1ff29e6d..81c3d65d68fe 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1957,7 +1957,7 @@ bail_nolock:
}
/*
- * NOTE: this should always be called with parent dir i_mutex taken.
+ * NOTE: this should always be called with parent dir i_rwsem taken.
*/
int ocfs2_find_files_on_disk(const char *name,
int namelen,
@@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
* Return 0 if the name does not exist
* Return -EEXIST if the directory contains the name
*
- * Callers should have i_mutex + a cluster lock on dir
+ * Callers should have i_rwsem + a cluster lock on dir
*/
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index fa0a14f199eb..e360543ad7e7 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -280,7 +280,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
- ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+ ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fc5f780fa235..01b7407a8893 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
/*
* Don't use ocfs2_mark_inode_dirty() here as we don't always
- * have i_mutex to guard against concurrent changes to other
+ * have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
inode->i_atime = current_time(inode);
@@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int ret;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
- ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
- clusters_to_add, mark_unwritten,
- data_ac, meta_ac, reason_ret);
-
- return ret;
+ return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
}
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -1068,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode,
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
- * i_mutex to block other extend/truncate calls while we're
+ * i_rwsem to block other extend/truncate calls while we're
* here. We even have to hold it for sparse files because there
* might be some tail zeroing.
*/
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 6c2411c2afcf..5739dc301569 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -713,7 +713,7 @@ bail:
/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
- * i_mutex held, we'll deadlock here. Instead we detect this
+ * i_rwsem held, we'll deadlock here. Instead we detect this
* and exit early - recovery will wipe this inode for us.
*/
static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f6bacbeef6b..c4426d12a2ad 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -606,7 +606,7 @@ out:
/*
* make sure we've got at least bits_wanted contiguous bits in the
- * local alloc. You lose them when you drop i_mutex.
+ * local alloc. You lose them when you drop i_rwsem.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
@@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* We must double check state and allocator bits because
- * another process may have changed them while holding i_mutex.
+ * another process may have changed them while holding i_rwsem.
*/
spin_lock(&osb->osb_lock);
if (!ocfs2_la_state_enabled(osb) ||
@@ -1029,7 +1029,7 @@ enum ocfs2_la_event {
/*
* Given an event, calculate the size of our next local alloc window.
*
- * This should always be called under i_mutex of the local alloc inode
+ * This should always be called under i_rwsem of the local alloc inode
* so that local alloc disabling doesn't race with processes trying to
* use the allocator.
*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..c75fd54b9185 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -476,7 +476,7 @@ leave:
ocfs2_free_alloc_context(meta_ac);
/*
- * We should call iput after the i_mutex of the bitmap been
+ * We should call iput after the i_rwsem of the bitmap has been
* unlocked in ocfs2_free_alloc_context, or the
* ocfs2_delete_inode will mutex_lock again.
*/
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bb62cc2e0211..337527571461 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -355,7 +355,7 @@ struct ocfs2_super
struct delayed_work la_enable_wq;
/*
- * Must hold local alloc i_mutex and osb->osb_lock to change
+ * Must hold local alloc i_rwsem and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
@@ -430,7 +430,7 @@ struct ocfs2_super
atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
- * It must be protected by osb_tl_inode->i_mutex.
+ * It must be protected by osb_tl_inode->i_rwsem.
*/
unsigned int truncated_clusters;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f033de733adb..0b6f551a342a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -36,7 +36,7 @@
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
* by dqio_sem or dquot->dq_lock.
- * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * - any modification of global quota file holds inode cluster lock, i_rwsem,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
* - an allocation of new blocks for local quota file is protected by
@@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
/* Read information header from global quota file */
int ocfs2_global_read_info(struct super_block *sb, int type)
{
- struct inode *gqinode = NULL;
unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
@@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
u64 pcount;
int status;
+ oinfo->dqi_gi.dqi_sb = sb;
+ oinfo->dqi_gi.dqi_type = type;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+ oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+ oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+ oinfo->dqi_gqi_bh = NULL;
+ oinfo->dqi_gqi_count = 0;
+
/* Read global header */
- gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
- if (!gqinode) {
+ if (!oinfo->dqi_gqinode) {
mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
type);
status = -EINVAL;
goto out_err;
}
- oinfo->dqi_gi.dqi_sb = sb;
- oinfo->dqi_gi.dqi_type = type;
- oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
- oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
- oinfo->dqi_gqi_bh = NULL;
- oinfo->dqi_gqi_count = 0;
- oinfo->dqi_gqinode = gqinode;
+
status = ocfs2_lock_global_qf(oinfo, 0);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
- status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
if (status < 0)
goto out_unlock;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 0e4b16d4c037..b1a8b046f4c2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
- oinfo->dqi_gqinode = NULL;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 85a47621e0c0..a75e2b7d67f5 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2772dec9dcea..477cdf94122e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -548,7 +548,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
{
struct ocfs2_inode_info *oi;
- oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
+ oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS);
if (!oi)
return NULL;
@@ -1105,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- root = d_make_root(inode);
- if (!root) {
- status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
- }
-
- sb->s_root = root;
-
- ocfs2_complete_mount_recovery(osb);
-
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
&ocfs2_kset->kobj);
if (!osb->osb_dev_kset) {
@@ -1133,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
+ root = d_make_root(inode);
+ if (!root) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
+ sb->s_root = root;
+
+ ocfs2_complete_mount_recovery(osb);
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..95d0611c5fc7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7205,7 +7205,7 @@ out:
* Used for reflink a non-preserve-security file.
*
* It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
+ * must not hold any lock except i_rwsem.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 89725b15a64b..3f297b541713 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,8 @@ const struct inode_operations omfs_file_inops = {
};
const struct address_space_operations omfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = omfs_readpage,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
diff --git a/fs/open.c b/fs/open.c
index 9ff2f621b760..1315253e0247 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f,
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
- f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index f825176ff4ed..f0b7f4d51a17 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -335,7 +335,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
{
struct op_inode_info *oi;
- oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL);
+ oi = alloc_inode_sb(sb, op_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index e5e3e500ed46..79c1025d18ea 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -46,7 +46,7 @@ static int orangefs_writepage_locked(struct page *page,
else
wlen = PAGE_SIZE;
}
- /* Should've been handled in orangefs_invalidatepage. */
+ /* Should've been handled in orangefs_invalidate_folio. */
WARN_ON(off == len || off + wlen > len);
bv.bv_page = page;
@@ -243,7 +243,7 @@ static int orangefs_writepages(struct address_space *mapping,
return ret;
}
-static int orangefs_launder_page(struct page *);
+static int orangefs_launder_folio(struct folio *);
static void orangefs_readahead(struct readahead_control *rac)
{
@@ -290,14 +290,15 @@ static void orangefs_readahead(struct readahead_control *rac)
static int orangefs_readpage(struct file *file, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct iov_iter iter;
struct bio_vec bv;
ssize_t ret;
loff_t off; /* offset into this page */
- if (PageDirty(page))
- orangefs_launder_page(page);
+ if (folio_test_dirty(folio))
+ orangefs_launder_folio(folio);
off = page_offset(page);
bv.bv_page = page;
@@ -330,6 +331,7 @@ static int orangefs_write_begin(struct file *file,
void **fsdata)
{
struct orangefs_write_range *wr;
+ struct folio *folio;
struct page *page;
pgoff_t index;
int ret;
@@ -341,27 +343,28 @@ static int orangefs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
+ folio = page_folio(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
* Should be impossible. If it happens, launder the page
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ wr = folio_get_private(folio);
if (wr->pos + wr->len == pos &&
uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
wr->len += len;
goto okay;
} else {
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
@@ -375,7 +378,7 @@ static int orangefs_write_begin(struct file *file,
wr->len = len;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
return 0;
}
@@ -415,47 +418,45 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static void orangefs_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void orangefs_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ struct orangefs_write_range *wr = folio_get_private(folio);
if (offset == 0 && length == PAGE_SIZE) {
- kfree(detach_page_private(page));
+ kfree(folio_detach_private(folio));
return;
/* write range entirely within invalidate range (or equal) */
- } else if (page_offset(page) + offset <= wr->pos &&
- wr->pos + wr->len <= page_offset(page) + offset + length) {
- kfree(detach_page_private(page));
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length) {
+ kfree(folio_detach_private(folio));
/* XXX is this right? only caller in fs */
- cancel_dirty_page(page);
+ folio_cancel_dirty(folio);
return;
/* invalidate range chops off end of write range */
- } else if (wr->pos < page_offset(page) + offset &&
- wr->pos + wr->len <= page_offset(page) + offset + length &&
- page_offset(page) + offset < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length &&
+ folio_pos(folio) + offset < wr->pos + wr->len) {
size_t x;
- x = wr->pos + wr->len - (page_offset(page) + offset);
+ x = wr->pos + wr->len - (folio_pos(folio) + offset);
WARN_ON(x > wr->len);
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range chops off beginning of write range */
- } else if (page_offset(page) + offset <= wr->pos &&
- page_offset(page) + offset + length < wr->pos + wr->len &&
- wr->pos < page_offset(page) + offset + length) {
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len &&
+ wr->pos < folio_pos(folio) + offset + length) {
size_t x;
- x = page_offset(page) + offset + length - wr->pos;
+ x = folio_pos(folio) + offset + length - wr->pos;
WARN_ON(x > wr->len);
wr->pos += x;
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range entirely within write range (punch hole) */
- } else if (wr->pos < page_offset(page) + offset &&
- page_offset(page) + offset + length < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len) {
/* XXX what do we do here... should not WARN_ON */
WARN_ON(1);
/* punch hole */
@@ -467,11 +468,11 @@ static void orangefs_invalidatepage(struct page *page,
/* non-overlapping ranges */
} else {
/* WARN if they do overlap */
- if (!((page_offset(page) + offset + length <= wr->pos) ^
- (wr->pos + wr->len <= page_offset(page) + offset))) {
+ if (!((folio_pos(folio) + offset + length <= wr->pos) ^
+ (wr->pos + wr->len <= folio_pos(folio) + offset))) {
WARN_ON(1);
- printk("invalidate range offset %llu length %u\n",
- page_offset(page) + offset, length);
+ printk("invalidate range offset %llu length %zu\n",
+ folio_pos(folio) + offset, length);
printk("write range offset %llu length %zu\n",
wr->pos, wr->len);
}
@@ -483,7 +484,7 @@ static void orangefs_invalidatepage(struct page *page,
* Thus the following runs if wr was modified above.
*/
- orangefs_launder_page(page);
+ orangefs_launder_folio(folio);
}
static int orangefs_releasepage(struct page *page, gfp_t foo)
@@ -496,17 +497,17 @@ static void orangefs_freepage(struct page *page)
kfree(detach_page_private(page));
}
-static int orangefs_launder_page(struct page *page)
+static int orangefs_launder_folio(struct folio *folio)
{
int r = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
};
- wait_on_page_writeback(page);
- if (clear_page_dirty_for_io(page)) {
- r = orangefs_writepage_locked(page, &wbc);
- end_page_writeback(page);
+ folio_wait_writeback(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ r = orangefs_writepage_locked(&folio->page, &wbc);
+ folio_end_writeback(folio);
}
return r;
}
@@ -633,19 +634,19 @@ static const struct address_space_operations orangefs_address_operations = {
.readahead = orangefs_readahead,
.readpage = orangefs_readpage,
.writepages = orangefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
- .invalidatepage = orangefs_invalidatepage,
+ .invalidate_folio = orangefs_invalidate_folio,
.releasepage = orangefs_releasepage,
.freepage = orangefs_freepage,
- .launder_page = orangefs_launder_page,
+ .launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
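orangefs hangs a struct orangefs_write_range off each dirty folio; the conversion swaps the page-private helpers for their folio equivalents. The round trip looks roughly like this (condensed sketch, error handling omitted):

static void example_folio_private(struct folio *folio,
				  struct orangefs_write_range *wr)
{
	folio_attach_private(folio, wr);	/* sets the private flag, takes a folio ref */

	if (folio_test_private(folio)) {
		struct orangefs_write_range *cur = folio_get_private(folio);

		cur->len = folio_size(folio);	/* update in place, as write_begin does */
	}

	kfree(folio_detach_private(folio));	/* clears the flag, drops the ref */
}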
vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
unsigned long *bitlock = &orangefs_inode->bitlock;
@@ -659,27 +660,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
goto out;
}
- lock_page(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ folio_lock(folio);
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
- * Should be impossible. If it happens, launder the page
+ * Should be impossible. If it happens, launder the folio
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
}
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
+ if (folio_test_private(folio)) {
+ wr = folio_get_private(folio);
if (uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
goto okay;
} else {
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
@@ -690,27 +691,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
file_update_time(vmf->vma->vm_file);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
goto out;
}
/*
- * We mark the page dirty already here so that when freeze is in
+ * We mark the folio dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
- * see the dirty page and writeprotect it again.
+ * see the dirty folio and writeprotect it again.
*/
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index d90d8addbfc2..5254256a224d 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -107,7 +107,7 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb)
{
struct orangefs_inode_s *orangefs_inode;
- orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
+ orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL);
if (!orangefs_inode)
return NULL;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7bb0a47cb615..001cdbb8f015 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -174,7 +174,7 @@ static struct kmem_cache *ovl_inode_cachep;
static struct inode *ovl_alloc_inode(struct super_block *sb)
{
- struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL);
+ struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 2667db9506e2..e140ea150bbb 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -607,7 +607,7 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, head, tail, mask;
+ unsigned int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
@@ -829,7 +829,7 @@ out_free_uid:
void free_pipe_info(struct pipe_inode_info *pipe)
{
- int i;
+ unsigned int i;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 80acb6885cf9..962d32468eb4 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -759,9 +759,14 @@ static void posix_acl_fix_xattr_userns(
}
void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
@@ -769,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
}
void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd8b0c12b2cb..eb815759842c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -88,7 +88,6 @@
#include <linux/pid_namespace.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d654ce7150fd..c1031843cc6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,7 +74,6 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
@@ -1764,25 +1763,25 @@ out:
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 6d8d4bf20837..2e244ada1f97 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -32,6 +32,8 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
xbc_for_each_key_value(leaf, val) {
ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f84355c5a36d..73aeb4e6d32e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -66,7 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde..a2873a617ae8 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 702754dd1daf..6f1b8ddc6f7a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-static DECLARE_RWSEM(vmcore_cb_rwsem);
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
/* Whether the vmcore has been opened once. */
@@ -70,8 +71,8 @@ static bool vmcore_opened;
void register_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
@@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);
void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
- list_del(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
* very unusual (e.g., forced driver removal), but we cannot stop
@@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
@@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn)
struct vmcore_cb *cb;
bool ret = true;
- lockdep_assert_held_read(&vmcore_cb_rwsem);
-
- list_for_each_entry(cb, &vmcore_cb_list, next) {
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
if (unlikely(!cb->pfn_is_ram))
continue;
ret = cb->pfn_is_ram(cb, pfn);
@@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn)
static int open_vmcore(struct inode *inode, struct file *file)
{
- down_read(&vmcore_cb_rwsem);
+ spin_lock(&vmcore_cb_lock);
vmcore_opened = true;
- up_read(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
return 0;
}
@@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset, userbuf);
}
if (tmp < 0) {
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return tmp;
}
@@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count,
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
- up_read(&vmcore_cb_rwsem);
return read;
}
@@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
- int ret;
+ int ret, idx;
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
if (!list_empty(&vmcore_cb_list))
ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return ret;
}
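The vmcore hunks above replace the rw_semaphore with a spinlock for list mutation plus SRCU for the read side, so pfn_is_ram() callers never sleep on a semaphore. A condensed sketch of the same register/iterate/unregister pattern, with hypothetical example_* names:

#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>

DEFINE_STATIC_SRCU(example_srcu);
static DEFINE_SPINLOCK(example_cb_lock);
static LIST_HEAD(example_cb_list);

struct example_cb {
	struct list_head next;
	bool (*pfn_is_ram)(struct example_cb *cb, unsigned long pfn);
};

static bool example_pfn_is_ram(unsigned long pfn)
{
	struct example_cb *cb;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&example_srcu);
	list_for_each_entry_srcu(cb, &example_cb_list, next,
				 srcu_read_lock_held(&example_srcu)) {
		ret = cb->pfn_is_ram(cb, pfn);
		if (!ret)
			break;
	}
	srcu_read_unlock(&example_srcu, idx);
	return ret;
}

static void example_unregister_cb(struct example_cb *cb)
{
	spin_lock(&example_cb_lock);
	list_del_rcu(&cb->next);
	spin_unlock(&example_cb_lock);
	/* wait until no SRCU reader can still be walking @cb */
	synchronize_srcu(&example_srcu);
}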
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
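The pstore hunks above revert buf_lock from a semaphore back to a spinlock: paths that must not block (NMI, panic, emergency restart) only try-lock and skip the dump on contention, while everyone else spins with interrupts disabled. A minimal sketch of that locking policy, with hypothetical example_* names:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_buf_lock);

static void example_dump_record(bool cannot_block)
{
	unsigned long flags;

	if (cannot_block) {
		/* NMI/panic path: never spin, give up on contention */
		if (!spin_trylock_irqsave(&example_buf_lock, flags))
			return;
	} else {
		spin_lock_irqsave(&example_buf_lock, flags);
	}

	/* ... emit the record into the shared buffer ... */

	spin_unlock_irqrestore(&example_buf_lock, flags);
}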
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fe5305028c6e..a89e33719fcf 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
if (prz->corrected_bytes || prz->bad_blocks)
ret = snprintf(str, len, ""
- "\n%d Corrected bytes, %d unrecoverable blocks\n",
+ "\nECC: %d Corrected bytes, %d unrecoverable blocks\n",
prz->corrected_bytes, prz->bad_blocks);
else
- ret = snprintf(str, len, "\nNo errors detected\n");
+ ret = snprintf(str, len, "\nECC: No errors detected\n");
return ret;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3fb7fc819b4f..a635bb6615e9 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -338,7 +338,7 @@ static struct kmem_cache *qnx4_inode_cachep;
static struct inode *qnx4_alloc_inode(struct super_block *sb)
{
struct qnx4_inode_info *ei;
- ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 61191f7bdf62..9d8e7e9788a1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -597,7 +597,7 @@ static struct kmem_cache *qnx6_inode_cachep;
static struct inode *qnx6_alloc_inode(struct super_block *sb)
{
struct qnx6_inode_info *ei;
- ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/read_write.c b/fs/read_write.c
index dc5000173b80..e643aec2b0ef 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1630,7 +1630,6 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (!*count)
return 0;
- /* FIXME: this is for backwards compatibility with 2.4 */
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 8fd54ed8f844..33c8b0dd07a2 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,10 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
- tristate "Reiserfs support"
+ tristate "Reiserfs support (deprecated)"
select CRC32
help
- Stores not just filenames but the files themselves in a balanced
- tree. Uses journalling.
+ Reiserfs is deprecated and scheduled to be removed from the kernel
+ in 2025. If you are still using it, please migrate to another
+ filesystem or tell us your use case for reiserfs.
+
+ Reiserfs stores not just filenames but the files themselves in a
+ balanced tree. Uses journalling.
Balanced trees are more efficient than traditional file system
architectural foundations.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f49b72ccac4c..36c59b25486c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2763,13 +2763,6 @@ static int reiserfs_write_begin(struct file *file,
int old_ref = 0;
inode = mapping->host;
- *fsdata = NULL;
- if (flags & AOP_FLAG_CONT_EXPAND &&
- (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
- pos ++;
- *fsdata = (void *)(unsigned long)flags;
- }
-
index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -2896,9 +2889,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
unsigned start;
bool locked = false;
- if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
- pos ++;
-
reiserfs_wait_on_write_block(inode->i_sb);
if (reiserfs_transaction_running(inode->i_sb))
th = current->journal_info;
@@ -3094,7 +3084,7 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
* decide if this buffer needs to stay around for data logging or ordered
* write purposes
*/
-static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
{
int ret = 1;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
@@ -3147,26 +3137,26 @@ free_jh:
return ret;
}
-/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+/* clm -- taken from fs/buffer.c:block_invalidate_folio */
+static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct buffer_head *head, *bh, *next;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int curr_off = 0;
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int ret = 1;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
+ folio_clear_checked(folio);
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
@@ -3179,7 +3169,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* is this block fully invalidated?
*/
if (offset <= curr_off) {
- if (invalidatepage_can_drop(inode, bh))
+ if (invalidate_folio_can_drop(inode, bh))
reiserfs_unmap_buffer(bh);
else
ret = 0;
@@ -3194,21 +3184,21 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* so real IO is not possible anymore.
*/
if (!partial_page && ret) {
- ret = try_to_release_page(page, 0);
+ ret = filemap_release_folio(folio, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
out:
return;
}
-static int reiserfs_set_page_dirty(struct page *page)
+static bool reiserfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- if (reiserfs_file_data_log(inode)) {
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ if (reiserfs_file_data_log(mapping->host)) {
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
- return __set_page_dirty_buffers(page);
+ return block_dirty_folio(mapping, folio);
}
/*
@@ -3316,7 +3306,11 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* fill in hole pointers in the expanding truncate case. */
if (attr->ia_size > inode->i_size) {
- error = generic_cont_expand_simple(inode, attr->ia_size);
+ loff_t pos = attr->ia_size;
+
+ if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
+ pos++;
+ error = generic_cont_expand_simple(inode, pos);
if (REISERFS_I(inode)->i_prealloc_count > 0) {
int err;
struct reiserfs_transaction_handle th;
@@ -3430,10 +3424,10 @@ const struct address_space_operations reiserfs_address_space_operations = {
.readpage = reiserfs_readpage,
.readahead = reiserfs_readahead,
.releasepage = reiserfs_releasepage,
- .invalidatepage = reiserfs_invalidatepage,
+ .invalidate_folio = reiserfs_invalidate_folio,
.write_begin = reiserfs_write_begin,
.write_end = reiserfs_write_end,
.bmap = reiserfs_aop_bmap,
.direct_IO = reiserfs_direct_IO,
- .set_page_dirty = reiserfs_set_page_dirty,
+ .dirty_folio = reiserfs_dirty_folio,
};
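The reiserfs hunks above are part of the tree-wide switch from the page-based ->set_page_dirty/->invalidatepage address_space operations to the folio-based ->dirty_folio/->invalidate_folio. A bare-bones sketch of a buffer_head-backed filesystem adopting the new methods, with hypothetical example_* names:

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>

static bool example_dirty_folio(struct address_space *mapping,
				struct folio *folio)
{
	/* buffer-backed mappings dirty the folio and its buffers together */
	return block_dirty_folio(mapping, folio);
}

static void example_invalidate_folio(struct folio *folio, size_t offset,
				     size_t length)
{
	/* drop (part of) the folio's buffers when it is invalidated */
	block_invalidate_folio(folio, offset, length);
}

static const struct address_space_operations example_aops = {
	.dirty_folio		= example_dirty_folio,
	.invalidate_folio	= example_invalidate_folio,
};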
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a3e21160b634..b5b6f6201bed 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -858,8 +858,8 @@ loop_next:
ret = -EIO;
}
/*
- * ugly interaction with invalidatepage here.
- * reiserfs_invalidate_page will pin any buffer that has a
+ * ugly interaction with invalidate_folio here.
+ * reiserfs_invalidate_folio will pin any buffer that has a
* valid journal head from an older transaction. If someone
* else sets our buffer dirty after we write it in the first
* loop, and then someone truncates the page away, nobody
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 82e09901462e..cfb7c44c7366 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -639,7 +639,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
static struct inode *reiserfs_alloc_inode(struct super_block *sb)
{
struct reiserfs_inode_info *ei;
- ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
atomic_set(&ei->openers, 0);
@@ -1652,6 +1652,8 @@ static int read_super_block(struct super_block *s, int offset)
return 1;
}
+ reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
+ "scheduled to be removed from the kernel in 2025");
SB_BUFFER_WITH_SB(s) = bh;
SB_DISK_SUPER_BLOCK(s) = rs;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index bc5fb006dc79..e112b5424cdb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -146,11 +146,11 @@ static int generic_remap_check_len(struct inode *inode_in,
}
/* Read a page's worth of file data into the page cache. */
-static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos)
+static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
struct folio *folio;
- folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL);
+ folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
if (IS_ERR(folio))
return folio;
if (!folio_test_uptodate(folio)) {
@@ -187,8 +187,8 @@ static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t dstoff,
+static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
+ struct file *dest, loff_t dstoff,
loff_t len, bool *is_same)
{
bool same = true;
@@ -224,8 +224,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
* someone is invalidating pages on us and we lose.
*/
if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
- src_folio->mapping != src->i_mapping ||
- dst_folio->mapping != dest->i_mapping) {
+ src_folio->mapping != src->f_mapping ||
+ dst_folio->mapping != dest->f_mapping) {
same = false;
goto unlock;
}
@@ -333,8 +333,8 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+ file_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 259f684d9236..9e6bbb4219de 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -375,7 +375,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
{
struct romfs_inode_info *inode;
- inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL);
return inode ? &inode->vfs_inode : NULL;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index f8e1f4ee87ff..7ab8a58c29b6 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -554,9 +554,9 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
}
EXPORT_SYMBOL(seq_dentry);
-static void *single_start(struct seq_file *p, loff_t *pos)
+void *single_start(struct seq_file *p, loff_t *pos)
{
- return NULL + (*pos == 0);
+ return *pos ? NULL : SEQ_START_TOKEN;
}
static void *single_next(struct seq_file *p, void *v, loff_t *pos)
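The seq_file hunk above replaces the undefined "NULL + 1" pointer arithmetic with the documented SEQ_START_TOKEN sentinel and makes single_start() non-static. A minimal single-record iterator using the same convention; names prefixed example_ are hypothetical:

#include <linux/seq_file.h>

static void *example_seq_start(struct seq_file *m, loff_t *pos)
{
	/* emit exactly one record: a non-NULL token on the first pass only */
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *example_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void example_seq_stop(struct seq_file *m, void *v)
{
}

static int example_seq_show(struct seq_file *m, void *v)
{
	seq_puts(m, "example\n");
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_seq_start,
	.next	= example_seq_next,
	.stop	= example_seq_stop,
	.show	= example_seq_show,
};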
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 38b8fc514860..0507aecfc669 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -61,6 +61,40 @@
#define NUMBER_OF_SMB2_COMMANDS 0x0013
/*
+ * Size of the session key (crypto key encrypted with the password)
+ */
+#define SMB2_NTLMV2_SESSKEY_SIZE 16
+#define SMB2_SIGNATURE_SIZE 16
+#define SMB2_HMACSHA256_SIZE 32
+#define SMB2_CMACAES_SIZE 16
+#define SMB3_GCM128_CRYPTKEY_SIZE 16
+#define SMB3_GCM256_CRYPTKEY_SIZE 32
+
+/*
+ * Size of the smb3 encryption/decryption keys
+ * This size is big enough to store any cipher key types.
+ */
+#define SMB3_ENC_DEC_KEY_SIZE 32
+
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE 16
+
+#define CIFS_CLIENT_CHALLENGE_SIZE 8
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
+/*
+ * The default wsize is 1M for SMB2 (and for some CIFS cases).
+ * find_get_pages seems to return a maximum of 256
+ * pages in a single call. With PAGE_SIZE == 4k, this means we can
+ * fill a single wsize request with a single call.
+ */
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+
+/*
* SMB2 Header Definition
*
* "MBZ" : Must be Zero
@@ -88,6 +122,15 @@
#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
+/*
+ * Definitions for SMB2 Protocol Data Units (network frames)
+ *
+ * See MS-SMB2.PDF specification for protocol details.
+ * The Naming convention is the lower case version of the SMB2
+ * command code name for the struct. Note that structures must be packed.
+ *
+ */
+
/* See MS-SMB2 section 2.2.1 */
struct smb2_hdr {
__le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
@@ -115,6 +158,18 @@ struct smb2_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
+#define SMB2_ERROR_STRUCTURE_SIZE2 9
+#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2)
+
+struct smb2_err_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u8 ErrorContextCount;
+ __u8 Reserved;
+ __le32 ByteCount; /* even if zero, at least one byte follows */
+ __u8 ErrorData[1]; /* variable length */
+} __packed;
+
#define SMB3_AES_CCM_NONCE 11
#define SMB3_AES_GCM_NONCE 12
@@ -608,8 +663,8 @@ struct smb2_close_req {
__le16 StructureSize; /* Must be 24 */
__le16 Flags;
__le32 Reserved;
- __le64 PersistentFileId; /* opaque endianness */
- __le64 VolatileFileId; /* opaque endianness */
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
} __packed;
/*
@@ -653,8 +708,8 @@ struct smb2_read_req {
__u8 Flags; /* MBZ unless SMB3.02 or later */
__le32 Length;
__le64 Offset;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
__le32 MinimumCount;
__le32 Channel; /* MBZ except for SMB3 or later */
__le32 RemainingBytes;
@@ -692,8 +747,8 @@ struct smb2_write_req {
__le16 DataOffset; /* offset from start of SMB2 header to write data */
__le32 Length;
__le64 Offset;
- __le64 PersistentFileId; /* opaque endianness */
- __le64 VolatileFileId; /* opaque endianness */
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
__le32 Channel; /* MBZ unless SMB3.02 or later */
__le32 RemainingBytes;
__le16 WriteChannelInfoOffset;
@@ -722,8 +777,8 @@ struct smb2_flush_req {
__le16 StructureSize; /* Must be 24 */
__le16 Reserved1;
__le32 Reserved2;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
} __packed;
struct smb2_flush_rsp {
@@ -732,6 +787,123 @@ struct smb2_flush_rsp {
__le16 Reserved;
} __packed;
+#define SMB2_LOCKFLAG_SHARED 0x0001
+#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002
+#define SMB2_LOCKFLAG_UNLOCK 0x0004
+#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
+#define SMB2_LOCKFLAG_MASK 0x0007
+
+struct smb2_lock_element {
+ __le64 Offset;
+ __le64 Length;
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+struct smb2_lock_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 48 */
+ __le16 LockCount;
+ /*
+ * The least significant four bits are the index, the other 28 bits are
+ * the lock sequence number (0 to 64). See MS-SMB2 2.2.26
+ */
+ __le32 LockSequenceNumber;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ /* Followed by at least one */
+ struct smb2_lock_element locks[1];
+} __packed;
+
+struct smb2_lock_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_echo_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+struct smb2_echo_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+/*
+ * Valid FileInformation classes for query directory
+ *
+ * Note that these are a subset of the (file) QUERY_INFO levels defined
+ * later in this file (but since QUERY_DIRECTORY uses equivalent numbers
+ * we do not redefine them here)
+ *
+ * FileDirectoryInformation 0x01
+ * FileFullDirectoryInformation 0x02
+ * FileIdFullDirectoryInformation 0x26
+ * FileBothDirectoryInformation 0x03
+ * FileIdBothDirectoryInformation 0x25
+ * FileNamesInformation 0x0C
+ * FileIdExtdDirectoryInformation 0x3C
+ */
+
+/* search (query_directory) Flags field */
+#define SMB2_RESTART_SCANS 0x01
+#define SMB2_RETURN_SINGLE_ENTRY 0x02
+#define SMB2_INDEX_SPECIFIED 0x04
+#define SMB2_REOPEN 0x10
+
+struct smb2_query_directory_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 FileInformationClass;
+ __u8 Flags;
+ __le32 FileIndex;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le16 FileNameOffset;
+ __le16 FileNameLength;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_directory_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/*
+ * Maximum number of iovs we need for a set-info request.
+ * The largest one is rename/hardlink
+ * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
+ * [1] : path
+ * [2] : compound padding
+ */
+#define SMB2_SET_INFO_IOV_SIZE 3
+
+struct smb2_set_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 BufferLength;
+ __le16 BufferOffset;
+ __u16 Reserved;
+ __le32 AdditionalInformation;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 2 */
+} __packed;
/*
* SMB2_NOTIFY See MS-SMB2 section 2.2.35
@@ -769,8 +941,8 @@ struct smb2_change_notify_req {
__le16 StructureSize;
__le16 Flags;
__le32 OutputBufferLength;
- __le64 PersistentFileId; /* opaque endianness */
- __le64 VolatileFileId; /* opaque endianness */
+ __u64 PersistentFileId; /* opaque endianness */
+ __u64 VolatileFileId; /* opaque endianness */
__le32 CompletionFilter;
__u32 Reserved;
} __packed;
@@ -978,12 +1150,455 @@ struct smb2_create_rsp {
__le64 EndofFile;
__le32 FileAttributes;
__le32 Reserved2;
- __le64 PersistentFileId;
- __le64 VolatileFileId;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
__u8 Buffer[1];
} __packed;
+struct create_posix {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 Mode;
+ __u32 Reserved;
+} __packed;
+
+#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
+
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
+
+#define SMB2_LEASE_KEY_SIZE 16
+
+struct lease_context {
+ __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+} __packed;
+
+struct lease_context_v2 {
+ __u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+ __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE];
+ __le16 Epoch;
+ __le16 Reserved;
+} __packed;
+
+struct create_lease {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context lcontext;
+} __packed;
+
+struct create_lease_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context_v2 lcontext;
+ __u8 Pad[4];
+} __packed;
+
+/* See MS-SMB2 2.2.31 and 2.2.32 */
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __le16 Reserved; /* MBZ */
+ __le32 CtlCode;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 InputOffset; /* offset from start of SMB2 header to input data */
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 Reserved;
+ __le32 CtlCode;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __le32 InputOffset; /* offset from start of SMB2 header to input data */
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[];
+} __packed;
+
+/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
+struct file_zero_data_information {
+ __le64 FileOffset;
+ __le64 BeyondFinalZero;
+} __packed;
+
+/* Reparse structures - see MS-FSCC 2.1.2 */
+
+/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
+struct reparse_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 DataBuffer[]; /* Variable Length */
+} __packed;
+
+struct reparse_guid_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 ReparseGuid[16];
+ __u8 DataBuffer[]; /* Variable Length */
+} __packed;
+
+struct reparse_mount_point_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __u8 PathBuffer[]; /* Variable Length */
+} __packed;
+
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
+struct reparse_symlink_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ __u8 PathBuffer[]; /* Variable Length */
+} __packed;
+
+/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+
+struct validate_negotiate_info_req {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 Dialect; /* Dialect in use for the connection */
+} __packed;
+
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
+/* Possible InfoType values */
+#define SMB2_O_INFO_FILE 0x01
+#define SMB2_O_INFO_FILESYSTEM 0x02
+#define SMB2_O_INFO_SECURITY 0x03
+#define SMB2_O_INFO_QUOTA 0x04
+
+/* SMB2 Query Info see MS-SMB2 (2.2.37) or MS-DTYP */
+
+/* List of QUERY INFO levels (those also valid for QUERY_DIR are noted below) */
+#define FILE_DIRECTORY_INFORMATION 1 /* also for QUERY_DIR */
+#define FILE_FULL_DIRECTORY_INFORMATION 2 /* also for QUERY_DIR */
+#define FILE_BOTH_DIRECTORY_INFORMATION 3 /* also for QUERY_DIR */
+#define FILE_BASIC_INFORMATION 4
+#define FILE_STANDARD_INFORMATION 5
+#define FILE_INTERNAL_INFORMATION 6
+#define FILE_EA_INFORMATION 7
+#define FILE_ACCESS_INFORMATION 8
+#define FILE_NAME_INFORMATION 9
+#define FILE_RENAME_INFORMATION 10
+#define FILE_LINK_INFORMATION 11
+#define FILE_NAMES_INFORMATION 12 /* also for QUERY_DIR */
+#define FILE_DISPOSITION_INFORMATION 13
+#define FILE_POSITION_INFORMATION 14
+#define FILE_FULL_EA_INFORMATION 15
+#define FILE_MODE_INFORMATION 16
+#define FILE_ALIGNMENT_INFORMATION 17
+#define FILE_ALL_INFORMATION 18
+#define FILE_ALLOCATION_INFORMATION 19
+#define FILE_END_OF_FILE_INFORMATION 20
+#define FILE_ALTERNATE_NAME_INFORMATION 21
+#define FILE_STREAM_INFORMATION 22
+#define FILE_PIPE_INFORMATION 23
+#define FILE_PIPE_LOCAL_INFORMATION 24
+#define FILE_PIPE_REMOTE_INFORMATION 25
+#define FILE_MAILSLOT_QUERY_INFORMATION 26
+#define FILE_MAILSLOT_SET_INFORMATION 27
+#define FILE_COMPRESSION_INFORMATION 28
+#define FILE_OBJECT_ID_INFORMATION 29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION 31
+#define FILE_QUOTA_INFORMATION 32
+#define FILE_REPARSE_POINT_INFORMATION 33
+#define FILE_NETWORK_OPEN_INFORMATION 34
+#define FILE_ATTRIBUTE_TAG_INFORMATION 35
+#define FILE_TRACKING_INFORMATION 36
+#define FILEID_BOTH_DIRECTORY_INFORMATION 37 /* also for QUERY_DIR */
+#define FILEID_FULL_DIRECTORY_INFORMATION 38 /* also for QUERY_DIR */
+#define FILE_VALID_DATA_LENGTH_INFORMATION 39
+#define FILE_SHORT_NAME_INFORMATION 40
+#define FILE_SFIO_RESERVE_INFORMATION 44
+#define FILE_SFIO_VOLUME_INFORMATION 45
+#define FILE_HARD_LINK_INFORMATION 46
+#define FILE_NORMALIZED_NAME_INFORMATION 48
+#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
+#define FILE_STANDARD_LINK_INFORMATION 54
+#define FILE_ID_INFORMATION 59
+#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */
+/* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */
+#define SMB_FIND_FILE_POSIX_INFO 0x064
+
+/* Security info type additionalinfo flags. */
+#define OWNER_SECINFO 0x00000001
+#define GROUP_SECINFO 0x00000002
+#define DACL_SECINFO 0x00000004
+#define SACL_SECINFO 0x00000008
+#define LABEL_SECINFO 0x00000010
+#define ATTRIBUTE_SECINFO 0x00000020
+#define SCOPE_SECINFO 0x00000040
+#define BACKUP_SECINFO 0x00010000
+#define UNPROTECTED_SACL_SECINFO 0x10000000
+#define UNPROTECTED_DACL_SECINFO 0x20000000
+#define PROTECTED_SACL_SECINFO 0x40000000
+#define PROTECTED_DACL_SECINFO 0x80000000
+
+/* Flags used for FileFullEAinfo */
+#define SL_RESTART_SCAN 0x00000001
+#define SL_RETURN_SINGLE_ENTRY 0x00000002
+#define SL_INDEX_SPECIFIED 0x00000004
+
+struct smb2_query_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 41 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 OutputBufferLength;
+ __le16 InputBufferOffset;
+ __u16 Reserved;
+ __le32 InputBufferLength;
+ __le32 AdditionalInformation;
+ __le32 Flags;
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/*
+ * PDU query infolevel structure definitions
+ */
+
+struct file_allocated_range_buffer {
+ __le64 file_offset;
+ __le64 length;
+} __packed;
+
+struct smb2_file_internal_info {
+ __le64 IndexNumber;
+} __packed; /* level 6 Query */
+
+struct smb2_file_rename_info { /* encoding of request for level 10 */
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (the spec says so, reason unclear) */
+ __le32 FileNameLength;
+ char FileName[]; /* New name to be assigned */
+ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */
+} __packed; /* level 10 Set */
+
+struct smb2_file_link_info { /* encoding of request for level 11 */
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (the spec says so, reason unclear) */
+ __le32 FileNameLength;
+ char FileName[]; /* Name to be assigned to new link */
+} __packed; /* level 11 Set */
+
+/*
+ * Although the struct has the same name, this level 18 differs from the cifs
+ * level 0x107 struct: level 0x107 has an extra u64 between AccessFlags and
+ * CurrentByteOffset.
+ */
+struct smb2_file_all_info { /* data block encoding of response to level 18 */
+ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ __u32 Pad1; /* End of FILE_BASIC_INFO equivalent */
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile; /* size, i.e. offset to first free byte in file */
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
+ __le64 IndexNumber;
+ __le32 EASize;
+ __le32 AccessFlags;
+ __le64 CurrentByteOffset;
+ __le32 Mode;
+ __le32 AlignmentRequirement;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 18 Query */
+
+struct smb2_file_eof_info { /* encoding of request for level 20 */
+ __le64 EndOfFile; /* new end of file value */
+} __packed; /* level 20 Set */
+
+/* Level 100 query info */
+struct smb311_posix_qinfo {
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ u8 Sids[];
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
+/* File System Information Classes */
+#define FS_VOLUME_INFORMATION 1 /* Query */
+#define FS_LABEL_INFORMATION 2 /* Set */
+#define FS_SIZE_INFORMATION 3 /* Query */
+#define FS_DEVICE_INFORMATION 4 /* Query */
+#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
+#define FS_CONTROL_INFORMATION 6 /* Query, Set */
+#define FS_FULL_SIZE_INFORMATION 7 /* Query */
+#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
+#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
+#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
+#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
+
+struct smb2_fs_full_size_info {
+ __le64 TotalAllocationUnits;
+ __le64 CallerAvailableAllocationUnits;
+ __le64 ActualAvailableAllocationUnits;
+ __le32 SectorsPerAllocationUnit;
+ __le32 BytesPerSector;
+} __packed;
+
+#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+ __le32 LogicalBytesPerSector;
+ __le32 PhysicalBytesPerSectorForAtomicity;
+ __le32 PhysicalBytesPerSectorForPerf;
+ __le32 FSEffPhysicalBytesPerSectorForAtomicity;
+ __le32 Flags;
+ __le32 ByteOffsetForSectorAlignment;
+ __le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
+/* File System Control Information */
+struct smb2_fs_control_info {
+ __le64 FreeSpaceStartFiltering;
+ __le64 FreeSpaceThreshold;
+ __le64 FreeSpaceStopFiltering;
+ __le64 DefaultQuotaThreshold;
+ __le64 DefaultQuotaLimit;
+ __le32 FileSystemControlFlags;
+ __le32 Padding;
+} __packed;
+
+/* volume info struct - see MS-FSCC 2.5.9 */
+#define MAX_VOL_LABEL_LEN 32
+struct smb3_fs_vol_info {
+ __le64 VolumeCreationTime;
+ __u32 VolumeSerialNumber;
+ __le32 VolumeLabelLength; /* includes trailing null */
+ __u8 SupportsObjects; /* True if eg like NTFS, supports objects */
+ __u8 Reserved;
+ __u8 VolumeLabel[]; /* variable len */
+} __packed;
+
+/* See MS-SMB2 2.2.23 through 2.2.25 */
+struct smb2_oplock_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 Reserved2;
+ __u64 PersistentFid;
+ __u64 VolatileFid;
+} __packed;
+
+#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
+
+struct smb2_lease_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 44 */
+ __le16 Epoch;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 CurrentLeaseState;
+ __le32 NewLeaseState;
+ __le32 BreakReason;
+ __le32 AccessMaskHint;
+ __le32 ShareMaskHint;
+} __packed;
+
+struct smb2_lease_ack {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 Reserved;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 LeaseState;
+ __le64 LeaseDuration;
+} __packed;
+#define OP_BREAK_STRUCT_SIZE_20 24
+#define OP_BREAK_STRUCT_SIZE_21 36
#endif /* _COMMON_SMB2PDU_H */
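In the header above, on-the-wire integers are declared __le16/__le32/__le64 and must go through the le*_to_cpu()/cpu_to_le*() helpers, while the file IDs were changed to plain __u64 because they are opaque cookies echoed back to the server verbatim. A small sketch of the distinction, assuming the smb2_read_req and smb2_close_req layouts defined above; example_* names are hypothetical:

#include <linux/types.h>
#include <asm/byteorder.h>

/* little-endian wire field: convert before using the value */
static inline u32 example_read_len(const struct smb2_read_req *req)
{
	return le32_to_cpu(req->Length);
}

/* opaque file ids: copied back as-is, never interpreted by the client */
static inline void example_copy_fid(const struct smb2_read_req *src,
				    struct smb2_close_req *dst)
{
	dst->PersistentFileId = src->PersistentFileId;
	dst->VolatileFileId = src->VolatileFileId;
}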
diff --git a/fs/splice.c b/fs/splice.c
index 5dbce4dcc1a7..047b79db8eb5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -46,45 +46,45 @@
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->page);
struct address_space *mapping;
- lock_page(page);
+ folio_lock(folio);
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (mapping) {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
/*
* At least for ext2 with nobh option, we need to wait on
- * writeback completing on this page, since we'll remove it
+ * writeback completing on this folio, since we'll remove it
* from the pagecache. Otherwise truncate wont wait on the
- * page, allowing the disk blocks to be reused by someone else
+ * folio, allowing the disk blocks to be reused by someone else
* before we actually wrote our data to them. fs corruption
* ensues.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
* and return good.
*/
- if (remove_mapping(mapping, page)) {
+ if (remove_mapping(mapping, folio)) {
buf->flags |= PIPE_BUF_FLAG_LRU;
return true;
}
}
/*
- * Raced with truncate or failed to remove page from current
+ * Raced with truncate or failed to remove folio from current
* address space, unlock and return failure.
*/
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return false;
}
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2db8bcf7ff85..622c844f6d11 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS)
- bio = bio_alloc(GFP_NOIO, page_count);
- else
+ if (page_count <= BIO_MAX_VECS) {
+ bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
+ } else {
bio = bio_kmalloc(GFP_NOIO, page_count);
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_opf = REQ_OP_READ;
+ }
if (!bio)
return -ENOMEM;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = READ;
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
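The squashfs hunk above adapts to the newer bio_alloc() signature, which takes the target block device and operation at allocation time instead of requiring separate bio_set_dev()/bi_opf assignments. A minimal sketch of the new calling convention, with hypothetical example_* names:

#include <linux/bio.h>
#include <linux/blk_types.h>
#include <linux/blkdev.h>

static struct bio *example_alloc_read_bio(struct block_device *bdev,
					   sector_t sector,
					   unsigned short nr_vecs)
{
	struct bio *bio;

	/* device and REQ_OP_READ are now passed directly to bio_alloc() */
	bio = bio_alloc(bdev, nr_vecs, REQ_OP_READ, GFP_NOIO);
	if (!bio)
		return NULL;
	bio->bi_iter.bi_sector = sector;
	return bio;
}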
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index b1b556dbce12..4f74abbc1a54 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -584,7 +584,7 @@ static void __exit exit_squashfs_fs(void)
static struct inode *squashfs_alloc_inode(struct super_block *sb)
{
struct squashfs_inode_info *ei =
- kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL);
return ei ? &ei->vfs_inode : NULL;
}
diff --git a/fs/stat.c b/fs/stat.c
index 28d2020ba1f4..5c2c94464e8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -184,6 +184,20 @@ int vfs_fstat(int fd, struct kstat *stat)
return error;
}
+int getname_statx_lookup_flags(int flags)
+{
+ int lookup_flags = 0;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT))
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ return lookup_flags;
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -199,26 +213,19 @@ int vfs_fstat(int fd, struct kstat *stat)
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-static int vfs_statx(int dfd, const char __user *filename, int flags,
+static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned lookup_flags = 0;
+ unsigned int lookup_flags = getname_statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
AT_STATX_SYNC_TYPE))
return -EINVAL;
- if (!(flags & AT_SYMLINK_NOFOLLOW))
- lookup_flags |= LOOKUP_FOLLOW;
- if (!(flags & AT_NO_AUTOMOUNT))
- lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
retry:
- error = user_path_at(dfd, filename, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
@@ -240,8 +247,15 @@ out:
int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ int ret;
+ int statx_flags = flags | AT_NO_AUTOMOUNT;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
+ putname(name);
+
+ return ret;
}
#ifdef __ARCH_WANT_OLD_STAT
@@ -334,9 +348,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -345,7 +356,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -353,7 +366,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -363,7 +376,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -602,7 +615,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer)
{
struct kstat stat;
@@ -636,7 +649,14 @@ SYSCALL_DEFINE5(statx,
unsigned int, mask,
struct statx __user *, buffer)
{
- return do_statx(dfd, filename, flags, mask, buffer);
+ int ret;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ ret = do_statx(dfd, name, flags, mask, buffer);
+ putname(name);
+
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -644,11 +664,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -658,7 +680,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 42dcf96881b6..a12ac0356c69 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -703,19 +703,6 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
ktype = get_ktype(kobj);
if (ktype) {
- struct attribute **kattr;
-
- /*
- * Change owner of the default attributes associated with the
- * ktype of @kobj.
- */
- for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) {
- error = sysfs_file_change_owner(kobj, (*kattr)->name,
- kuid, kgid);
- if (error)
- return error;
- }
-
/*
* Change owner of the default groups associated with the
* ktype of @kobj.
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index e747c135c1d1..98467bb76737 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -103,7 +103,7 @@ int __init sysfs_init(void)
if (IS_ERR(sysfs_root))
return PTR_ERR(sysfs_root);
- sysfs_root_kn = sysfs_root->kn;
+ sysfs_root_kn = kernfs_root_to_node(sysfs_root);
err = register_filesystem(&sysfs_fs_type);
if (err) {
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index be47263b8605..9e8d4a6fb2f3 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -306,7 +306,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
{
struct sysv_inode_info *si;
- si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL);
+ si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
if (!si)
return NULL;
return &si->vfs_inode;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 749385015a8d..409ab5e17803 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -495,7 +495,8 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations sysv_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = sysv_readpage,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index dbe72f664abf..86151889548e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -349,20 +349,97 @@ out_budg:
return err;
}
-static int do_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int err;
+ umode_t mode = S_IFCHR | WHITEOUT_MODE;
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm;
+
+ /*
+ * Create an inode ('nlink = 1') for the whiteout without updating the
+ * journal; let ubifs_jnl_rename() store it on flash to complete the
+ * rename-whiteout atomically.
+ */
+
+ dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+ dentry, mode, dir->i_ino);
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ return ERR_PTR(err);
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_free;
+ }
+
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
+
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_inode;
+
+ /* The dir size is updated by do_rename. */
+ insert_inode_hash(inode);
+
+ return inode;
+
+out_inode:
+ make_bad_inode(inode);
+ iput(inode);
+out_free:
+ fscrypt_free_filename(&nm);
+ ubifs_err(c, "cannot create whiteout file, error %d", err);
+ return ERR_PTR(err);
+}
+
+/**
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+ mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+}
+
+/**
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+ mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+}
+
+static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
- struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
+ struct ubifs_inode *ui;
int err, instantiated = 0;
struct fscrypt_name nm;
/*
- * Budget request settings: new dirty inode, new direntry,
- * budget for dirtied inode will be released via writeback.
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ * Allocate budget separately for the new dirtied inode; that budget
+ * will be released via writeback.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
@@ -392,42 +469,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
}
ui = ubifs_inode(inode);
- if (whiteout) {
- init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
- ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
- }
-
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&ui->ui_mutex);
insert_inode_hash(inode);
-
- if (whiteout) {
- mark_inode_dirty(inode);
- drop_nlink(inode);
- *whiteout = inode;
- } else {
- d_tmpfile(dentry, inode);
- }
+ d_tmpfile(dentry, inode);
ubifs_assert(c, ui->dirty);
instantiated = 1;
mutex_unlock(&ui->ui_mutex);
- mutex_lock(&dir_ui->ui_mutex);
+ lock_2_inodes(dir, inode);
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
- mutex_unlock(&dir_ui->ui_mutex);
+ unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
return 0;
out_cancel:
- mutex_unlock(&dir_ui->ui_mutex);
+ unlock_2_inodes(dir, inode);
out_inode:
make_bad_inode(inode);
if (!instantiated)
@@ -441,12 +506,6 @@ out_budg:
return err;
}
-static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- return do_tmpfile(dir, dentry, mode, NULL);
-}
-
/**
* vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type
@@ -660,32 +719,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
return 0;
}
-/**
- * lock_2_inodes - a wrapper for locking two UBIFS inodes.
- * @inode1: first inode
- * @inode2: second inode
- *
- * We do not implement any tricks to guarantee strict lock ordering, because
- * VFS has already done it for us on the @i_mutex. So this is just a simple
- * wrapper function.
- */
-static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
-{
- mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
- mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-}
-
-/**
- * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
- * @inode1: first inode
- * @inode2: second inode
- */
-static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
-{
- mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
- mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-}
-
static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
@@ -949,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct fscrypt_name nm;
/*
@@ -1264,17 +1298,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino = 3 };
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
+ struct ubifs_budget_req wht_req;
struct timespec64 time;
unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
/*
- * Budget request settings: deletion direntry, new direntry, removing
- * the old inode, and changing old and new parent directory inodes.
+ * Budget request settings:
+ * req: deletion direntry, new direntry, removing the old inode,
+ * and changing old and new parent directory inodes.
+ *
+ * wht_req: new whiteout inode for RENAME_WHITEOUT.
*
- * However, this operation also marks the target inode as dirty and
- * does not write it, so we allocate budget for the target inode
- * separately.
+ * ino_req: marks the target inode as dirty and does not write it.
*/
dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x",
@@ -1331,20 +1367,44 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_release;
}
- err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout);
- if (err) {
+ /*
+ * The whiteout inode has no dentry and is pinned in memory; unmount
+ * cannot happen during the rename because we hold the parent dentry.
+ */
+ whiteout = create_whiteout(old_dir, old_dentry);
+ if (IS_ERR(whiteout)) {
+ err = PTR_ERR(whiteout);
kfree(dev);
goto out_release;
}
- spin_lock(&whiteout->i_lock);
- whiteout->i_state |= I_LINKABLE;
- spin_unlock(&whiteout->i_lock);
-
whiteout_ui = ubifs_inode(whiteout);
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
ubifs_assert(c, !whiteout_ui->dirty);
+
+ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
+ wht_req.new_ino = 1;
+ wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8);
+ /*
+ * To avoid a deadlock between space budgeting (which holds ui_mutex
+ * and waits for writeback work) and the writeback work (which waits
+ * for ui_mutex), do the space budgeting before the ubifs inodes are
+ * locked.
+ */
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+ /*
+ * The whiteout inode cannot be written to flash by
+ * ubifs_jnl_write_inode(), because it's neither
+ * dirty nor zero-nlink.
+ */
+ iput(whiteout);
+ goto out_release;
+ }
+
+ /* Add the old_dentry size to the old_dir size. */
+ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
@@ -1416,29 +1476,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
if (unlink && IS_SYNC(new_inode))
sync = 1;
- }
-
- if (whiteout) {
- struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
- .dirtied_ino_d = \
- ALIGN(ubifs_inode(whiteout)->data_len, 8) };
-
- err = ubifs_budget_space(c, &wht_req);
- if (err) {
- kfree(whiteout_ui->data);
- whiteout_ui->data_len = 0;
- iput(whiteout);
- goto out_release;
- }
-
- inc_nlink(whiteout);
- mark_inode_dirty(whiteout);
-
- spin_lock(&whiteout->i_lock);
- whiteout->i_state &= ~I_LINKABLE;
- spin_unlock(&whiteout->i_lock);
-
- iput(whiteout);
+ /*
+ * The whiteout's S_SYNC flag is inherited from old_dir, and we
+ * have already checked the old dir inode, so there is no need
+ * to check the whiteout.
+ */
}
err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
@@ -1449,6 +1491,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &req);
+ if (whiteout) {
+ ubifs_release_budget(c, &wht_req);
+ iput(whiteout);
+ }
+
mutex_lock(&old_inode_ui->ui_mutex);
release = old_inode_ui->dirty;
mark_inode_dirty_sync(old_inode);
@@ -1457,11 +1504,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (release)
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
- err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+ /*
+ * The rename has finished at this point. Even if the old inode
+ * cannot be updated on flash, a stale ctime is not a big problem,
+ * so don't return an error code to userspace.
+ */
+ old_inode->i_sb->s_op->write_inode(old_inode, NULL);
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
- return err;
+ return 0;
out_cancel:
if (unlink) {
@@ -1482,11 +1534,11 @@ out_cancel:
inc_nlink(old_dir);
}
}
+ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
if (whiteout) {
- drop_nlink(whiteout);
+ ubifs_release_budget(c, &wht_req);
iput(whiteout);
}
- unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
out_release:
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..0383fbdc95ff 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len)
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1287,25 +1287,25 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ubifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- ubifs_assert(c, PagePrivate(page));
- if (offset || length < PAGE_SIZE)
- /* Partial page remains dirty */
+ ubifs_assert(c, folio_test_private(folio));
+ if (offset || length < folio_size(folio))
+ /* Partial folio remains dirty */
return;
- if (PageChecked(page))
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
- ClearPageChecked(page);
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
}
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -1445,18 +1445,18 @@ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
-static int ubifs_set_page_dirty(struct page *page)
+static bool ubifs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- int ret;
- struct inode *inode = page->mapping->host;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
+ bool ret;
+ struct ubifs_info *c = mapping->host->i_sb->s_fs_info;
- ret = __set_page_dirty_nobuffers(page);
+ ret = filemap_dirty_folio(mapping, folio);
/*
* An attempt to dirty a page without budgeting for it - should not
* happen.
*/
- ubifs_assert(c, ret == 0);
+ ubifs_assert(c, ret == false);
return ret;
}
@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping,
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -1646,8 +1646,8 @@ const struct address_space_operations ubifs_file_address_operations = {
.writepage = ubifs_writepage,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
- .invalidatepage = ubifs_invalidatepage,
- .set_page_dirty = ubifs_set_page_dirty,
+ .invalidate_folio = ubifs_invalidate_folio,
+ .dirty_folio = ubifs_dirty_folio,
#ifdef CONFIG_MIGRATION
.migratepage = ubifs_migrate_page,
#endif
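
Editor's note: the hunks above swap SetPagePrivate()/ClearPagePrivate() for attach_page_private()/detach_page_private(), which also manage the page reference and the private data word. A simplified sketch of what those helpers do, assuming the usual <linux/pagemap.h> semantics (the real definitions live there; this is not the verbatim kernel code):

/* Sketch only: attach takes a page reference along with setting the flag. */
static inline void attach_page_private_sketch(struct page *page, void *data)
{
	get_page(page);				/* private data pins the page */
	set_page_private(page, (unsigned long)data);
	SetPagePrivate(page);
}

/* Sketch only: detach undoes all three steps and returns the old data. */
static inline void *detach_page_private_sketch(struct page *page)
{
	void *data = (void *)page_private(page);

	if (!PagePrivate(page))
		return NULL;
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);				/* drop the reference from attach */
	return data;
}

This is why UBIFS can pass the dummy value (void *)1: it only needs PagePrivate() to be set and cleared consistently, not the stored pointer itself.
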
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 789a7813f3fa..1607a3c76681 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The unwritten length of 'buf' may be less than 'n' because
+ * the 'len' parameter is not 8-byte aligned, so read
+ * min(len, n) bytes from 'buf' here.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
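
Editor's note: the reworked branch above writes (n - 1) full max_write units straight out of 'buf', then stages the final unit in wbuf->buf and pads it, because 'len' need not be 8-byte aligned while 'aligned_len' is. A small userspace model of that arithmetic (MAX_WRITE_SHIFT and the payload length below are made-up values for illustration):

#include <stdio.h>

#define MAX_WRITE_SHIFT	11			/* hypothetical 2048-byte write unit */

int main(void)
{
	int len = 4090;				/* unaligned payload length */
	int aligned_len = (len + 7) & ~7;	/* 4096 */
	int written = 0, offs = 0;
	int n = aligned_len >> MAX_WRITE_SHIFT;	/* number of full write units */

	if (n) {
		int m = (n - 1) << MAX_WRITE_SHIFT;	/* bulk bytes taken from 'buf' */

		if (m) {
			offs += m;
			aligned_len -= m;
			len -= m;
			written += m;
		}

		/* Final unit: copy min(len, n) bytes, pad the remainder. */
		n = 1 << MAX_WRITE_SHIFT;
		int copy = len < n ? len : n;
		int pad = n > len ? n - len : 0;	/* nonzero only at the very end, then < 8 */

		printf("bulk=%d copy=%d pad=%d\n", m, copy, pad);
		offs += n;
		aligned_len -= n;
		len -= copy;
		written += n;
	}
	printf("written=%d remaining_len=%d remaining_aligned=%d\n",
	       written, len, aligned_len);
	return 0;
}

With these numbers the first line printed is "bulk=2048 copy=2042 pad=6", matching the 'n - len < 8' assertion in the patch.
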
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index c6a863487780..71bcebe45f9c 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags)
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
- .dirtied_ino_d = ui->data_len };
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
err = ubifs_budget_space(c, &req);
if (err)
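
Editor's note: the fix above budgets ALIGN(ui->data_len, 8) rather than the raw data_len, matching the 8-byte alignment the journal uses when packing inode data. For reference, ALIGN() rounds up to the next multiple of a power-of-two; a standalone illustration of the rounding (ALIGN_UP below is a stand-in for the kernel macro):

#include <stdio.h>

/* Same rounding the kernel's ALIGN() performs for a power-of-two 'a'. */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int lens[] = { 0, 1, 7, 8, 9, 30 };
	unsigned int i;

	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("data_len=%2u -> budgeted=%2u\n",
		       lens[i], ALIGN_UP(lens[i], 8));
	return 0;
}
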
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 8ea680dba61e..75dab0ae3939 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1207,9 +1207,9 @@ out_free:
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the re-name operation which may involve writing up
- * to 4 inodes and 2 directory entries. It marks the written inodes as clean
- * and returns zero on success. In case of failure, a negative error code is
- * returned.
+ * to 4 inodes (new inode, whiteout inode, old and new parent directory inodes)
+ * and 2 directory entries. It marks the written inodes as clean and returns
+ * zero on success. In case of failure, a negative error code is returned.
*/
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
const struct inode *old_inode,
@@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
- int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
+ int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0;
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
- struct ubifs_inode *new_ui;
+ struct ubifs_inode *new_ui, *whiteout_ui;
u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
+ u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ];
u8 hash_dent1[UBIFS_HASH_ARR_SZ];
u8 hash_dent2[UBIFS_HASH_ARR_SZ];
@@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
} else
ilen = 0;
+ if (whiteout) {
+ whiteout_ui = ubifs_inode(whiteout);
+ ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex));
+ ubifs_assert(c, whiteout->i_nlink == 1);
+ ubifs_assert(c, !whiteout_ui->dirty);
+ wlen = UBIFS_INO_NODE_SZ;
+ wlen += whiteout_ui->data_len;
+ } else
+ wlen = 0;
+
aligned_dlen1 = ALIGN(dlen1, 8);
aligned_dlen2 = ALIGN(dlen2, 8);
- len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+ len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) +
+ ALIGN(wlen, 8) + ALIGN(plen, 8);
if (move)
len += plen;
@@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
p += ALIGN(ilen, 8);
}
+ if (whiteout) {
+ pack_inode(c, p, whiteout, 0);
+ err = ubifs_node_calc_hash(c, p, hash_whiteout_inode);
+ if (err)
+ goto out_release;
+
+ p += ALIGN(wlen, 8);
+ }
+
if (!move) {
pack_inode(c, p, old_dir, 1);
err = ubifs_node_calc_hash(c, p, hash_old_dir);
@@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (new_inode)
ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
new_inode->i_ino);
+ if (whiteout)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ whiteout->i_ino);
}
release_head(c, BASEHD);
@@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
if (err)
goto out_ro;
-
- ubifs_delete_orphan(c, whiteout->i_ino);
} else {
err = ubifs_add_dirt(c, lnum, dlen2);
if (err)
@@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
offs += ALIGN(ilen, 8);
}
+ if (whiteout) {
+ ino_key_init(c, &key, whiteout->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, wlen,
+ hash_whiteout_inode);
+ if (err)
+ goto out_ro;
+ offs += ALIGN(wlen, 8);
+ }
+
ino_key_init(c, &key, old_dir->i_ino);
err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
if (err)
@@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
new_ui->synced_i_size = new_ui->ui_size;
spin_unlock(&new_ui->ui_lock);
}
+ /*
+ * There is no need to mark the whiteout inode clean.
+ * The whiteout has zero size, so there is no need to update
+ * synced_i_size for whiteout_ui.
+ */
mark_inode_clean(c, ubifs_inode(old_dir));
if (move)
mark_inode_clean(c, ubifs_inode(new_dir));
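
Editor's note: because the whiteout inode is now packed into the same journal record, the record length gains an ALIGN(wlen, 8) term alongside the dentry, target-inode and parent-inode nodes. A rough userspace model of the length computation (the node sizes below are placeholders, not the real UBIFS constants):

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7u) & ~7u)

int main(void)
{
	/* Placeholder sizes, for illustration only. */
	unsigned int dlen1 = 61, dlen2 = 58;	/* directory entry nodes */
	unsigned int ilen = 0;			/* no unlinked target inode */
	unsigned int wlen = 160 + 6;		/* whiteout inode node + data */
	unsigned int plen = 160;		/* parent directory inode node */
	int move = 1;				/* old_dir != new_dir */
	unsigned int len;

	len = ALIGN8(dlen1) + ALIGN8(dlen2) + ALIGN8(ilen) +
	      ALIGN8(wlen) + ALIGN8(plen);
	if (move)
		len += plen;			/* second parent inode node */

	printf("journal record length = %u bytes\n", len);
	return 0;
}
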
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index aa7a1381c457..bad67455215f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -268,7 +268,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
{
struct ubifs_inode *ui;
- ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+ ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS);
if (!ui)
return NULL;
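
Editor's note: ubifs_alloc_inode() (and the matching *_alloc_inode() conversions later in this diff: udf, ufs, vboxsf, xfs) switches from a bare kmem_cache_alloc() to alloc_inode_sb(). Assuming the helper introduced for list_lru/memcg-aware inode allocation, it is roughly a one-line wrapper; a sketch, not the verbatim <linux/fs.h> definition:

/* Sketch: charge the inode object against the superblock's inode LRU. */
static inline void *alloc_inode_sb_sketch(struct super_block *sb,
					  struct kmem_cache *cache, gfp_t gfp)
{
	return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
}

The point of the conversion is that the slab object gets accounted to the right memcg via the superblock, which a plain kmem_cache_alloc() cannot do.
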
diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c
index 7acc5a74e5fa..06ad8fa1fcfb 100644
--- a/fs/ubifs/sysfs.c
+++ b/fs/ubifs/sysfs.c
@@ -42,6 +42,7 @@ static struct attribute *ubifs_attrs[] = {
ATTR_LIST(errors_crc),
NULL,
};
+ATTRIBUTE_GROUPS(ubifs);
static ssize_t ubifs_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
@@ -74,7 +75,7 @@ static const struct sysfs_ops ubifs_attr_ops = {
};
static struct kobj_type ubifs_sb_ktype = {
- .default_attrs = ubifs_attrs,
+ .default_groups = ubifs_groups,
.sysfs_ops = &ubifs_attr_ops,
.release = ubifs_sb_release,
};
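
Editor's note: ATTRIBUTE_GROUPS(ubifs) is what provides the ubifs_groups symbol used in the new .default_groups initializer: it wraps the existing ubifs_attrs array in an attribute_group plus a NULL-terminated list of groups. Roughly, the macro's effect is the following (a sketch, not the verbatim expansion from <linux/sysfs.h>):

/* Approximately what ATTRIBUTE_GROUPS(ubifs) generates from ubifs_attrs. */
static const struct attribute_group ubifs_group = {
	.attrs = ubifs_attrs,
};

static const struct attribute_group *ubifs_groups[] = {
	&ubifs_group,
	NULL,
};
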
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f55828c0a300..008fa46ef61e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -381,7 +381,7 @@ struct ubifs_gced_idx_leb {
* @ui_mutex exists for two main reasons. First, it prevents inodes from
* being written back while UBIFS is changing them, i.e. in the middle of a VFS
* operation. This way UBIFS makes sure the inode fields are consistent. For
- * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
+ * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and
* write-back must not write any of them before we have finished.
*
* The second reason is budgeting - UBIFS has to budget all operations. If an
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1baff8ddb754..0f6bf2504437 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,8 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
}
const struct address_space_operations udf_adinicb_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ea8f6cd01f50..ca4fa710e562 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -235,7 +235,8 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations udf_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_readpage,
.readahead = udf_readahead,
.writepage = udf_writepage,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f26b5e0b84b6..4042d9739fb7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -136,7 +136,7 @@ static struct kmem_cache *udf_inode_cachep;
static struct inode *udf_alloc_inode(struct super_block *sb)
{
struct udf_inode_info *ei;
- ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -2474,7 +2474,6 @@ static unsigned int udf_count_free_table(struct super_block *sb,
unsigned int accum = 0;
uint32_t elen;
struct kernel_lb_addr eloc;
- int8_t etype;
struct extent_position epos;
mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
@@ -2482,7 +2481,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
epos.offset = sizeof(struct unallocSpaceEntry);
epos.bh = NULL;
- while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
+ while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1)
accum += (elen >> table->i_sb->s_blocksize_bits);
brelse(epos.bh);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ac628de69601..d0dda01620f0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,7 +526,8 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations ufs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 00a01471ea05..23377c1baed9 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1443,7 +1443,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
{
struct ufs_inode_info *ei;
- ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 0cc87423de82..0e51c0025a16 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -33,7 +33,7 @@ $(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
else
$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE
- $(call if_changed,shipped)
+ $(call if_changed,copy)
endif
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8e03b3d3f5fa..aa0c47cb0d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
struct uffd_msg msg;
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
+
+ if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
+ address &= PAGE_MASK;
msg.arg.pagefault.address = address;
/*
* These flags indicate why the userfault occurred:
@@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
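
Editor's note: unless userspace negotiated UFFD_FEATURE_EXACT_ADDRESS, the fault address reported in the uffd message is still rounded down to a page boundary; that is what the new PAGE_MASK step preserves now that the exact vmf->real_address is passed in. A standalone illustration of the rounding (the page size and feature bit below are assumptions for the demo, not the real UAPI values):

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))
#define DEMO_FEATURE_EXACT_ADDRESS	(1UL << 0)	/* placeholder bit */

static unsigned long report_address(unsigned long addr, unsigned long features)
{
	if (!(features & DEMO_FEATURE_EXACT_ADDRESS))
		addr &= DEMO_PAGE_MASK;		/* legacy: page-aligned address */
	return addr;
}

int main(void)
{
	unsigned long fault = 0x7f1234567abcUL;

	printf("legacy: %#lx\n", report_address(fault, 0));
	printf("exact:  %#lx\n",
	       report_address(fault, DEMO_FEATURE_EXACT_ADDRESS));
	return 0;
}
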
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 864c2fad23be..d74e0d336995 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -354,7 +354,7 @@ out:
const struct address_space_operations vboxsf_reg_aops = {
.readpage = vboxsf_readpage,
.writepage = vboxsf_writepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
.write_end = vboxsf_write_end,
};
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 37dd3fe5b1e9..d2f6df69f611 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -241,7 +241,7 @@ static struct inode *vboxsf_alloc_inode(struct super_block *sb)
{
struct vboxsf_inode *sf_i;
- sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS);
+ sf_i = alloc_inode_sb(sb, vboxsf_inode_cachep, GFP_NOFS);
if (!sf_i)
return NULL;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index aec2ebf7d25a..e1db0f3f7e5e 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -9,6 +9,7 @@
#include <linux/namei.h>
#include <linux/nls.h>
#include <linux/sizes.h>
+#include <linux/pagemap.h>
#include <linux/vfs.h>
#include "vfsmod.h"
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 0adb970f4e73..14e2fb49cff5 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Data verification functions, i.e. hooks for ->readpages()
+ * Data verification functions, i.e. hooks for ->readahead()
*
* Copyright 2019 Google LLC
*/
@@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
* that fail verification are set to the Error state. Verification is skipped
* for pages already in the Error state, e.g. due to fscrypt decryption failure.
*
- * This is a helper function for use by the ->readpages() method of filesystems
+ * This is a helper function for use by the ->readahead() method of filesystems
* that issue bios to read data directly into the page cache. Filesystems that
* populate the page cache without issuing bios (e.g. non block-based
* filesystems) must instead call fsverity_verify_page() directly on each page.
diff --git a/fs/xattr.c b/fs/xattr.c
index 5c8c5175b385..998045165916 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -569,7 +569,8 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
}
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
+ posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
+ kvalue, size);
}
error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
@@ -667,7 +668,8 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error > 0) {
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
+ posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
+ kvalue, error);
if (size && copy_to_user(value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 353e53b892e6..b52ed339727f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -82,6 +82,24 @@ xfs_prealloc_blocks(
}
/*
+ * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * guarantee that we can refill the AGFL prior to allocating space in a nearly
+ * full AG. Although the the space described by the free space btrees, the
+ * blocks used by the freesp btrees themselves, and the blocks owned by the
+ * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
+ * free space in the AG drop so low that the free space btrees cannot refill an
+ * empty AGFL up to the minimum level. Rather than grind through empty AGs
+ * until the fs goes down, we subtract this many AG blocks from the incore
+ * fdblocks to ensure user allocation does not overcommit the space the
+ * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
+ * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ */
+#define XFS_ALLOCBT_AGFL_RESERVE 4
+
+/*
+ * Compute the number of blocks that we set aside to guarantee the ability to
+ * refill the AGFL and handle a full bmap btree split.
+ *
* In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
* AGF buffer (PV 947395), we place constraints on the relationship among
* actual allocations for data blocks, freelist blocks, and potential file data
@@ -93,14 +111,14 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
- * potential split of the file's bmap btree.
+ * For each AG, we need to reserve enough blocks to replenish a totally empty
+ * AGFL and 4 more to handle a potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
+ return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}
/*
@@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable(
unsigned int blocks;
blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
- blocks += XFS_ALLOC_AGFL_RESERVE;
+ blocks += XFS_ALLOCBT_AGFL_RESERVE;
blocks += 3; /* AGF, AGI btree root blocks */
if (xfs_has_finobt(mp))
blocks++; /* finobt root block */
if (xfs_has_rmapbt(mp))
- blocks++; /* rmap root block */
+ blocks++; /* rmap root block */
if (xfs_has_reflink(mp))
blocks++; /* refcount root block */
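
Editor's note: numerically nothing changes here: XFS_ALLOCBT_AGFL_RESERVE keeps the value 4, so the set-aside is still 8 blocks per AG (the AGFL refill guarantee plus a potential bmap btree split); the constant merely becomes private to xfs_alloc.c. A quick check of the arithmetic (the AG count is arbitrary):

#include <stdio.h>

#define XFS_ALLOCBT_AGFL_RESERVE	4

/* Mirrors xfs_alloc_set_aside(): per-AG reserve times the AG count. */
static unsigned int set_aside(unsigned int agcount)
{
	return agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}

int main(void)
{
	unsigned int agcount = 16;	/* example filesystem with 16 AGs */

	printf("%u AGs -> %u blocks withheld from fdblocks\n",
	       agcount, set_aside(agcount));
	return 0;
}
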
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 1c14a0b1abea..d4c057b764f9 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
/* freespace limit calculations */
-#define XFS_ALLOC_AGFL_RESERVE 4
unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f18a875f51c6..c1500b238520 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2818,7 +2818,7 @@ xfs_btree_split_worker(
* in any way.
*/
if (args->kswapd)
- new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
xfs_trans_set_context(args->cur->bc_tp);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 50546eadaae2..5f1e4799e8fa 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,7 +19,11 @@
#include "xfs_error.h"
#include "xfs_trace.h"
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+const struct xfs_name xfs_name_dotdot = {
+ .name = (const unsigned char *)"..",
+ .len = 2,
+ .type = XFS_DIR3_FT_DIR,
+};
/*
* Convert inode mode to directory entry filetype
@@ -54,10 +58,10 @@ xfs_mode_to_ftype(
*/
xfs_dahash_t
xfs_ascii_ci_hashname(
- struct xfs_name *name)
+ const struct xfs_name *name)
{
- xfs_dahash_t hash;
- int i;
+ xfs_dahash_t hash;
+ int i;
for (i = 0, hash = 0; i < name->len; i++)
hash = tolower(name->name[i]) ^ rol32(hash, 7);
@@ -243,7 +247,7 @@ int
xfs_dir_createname(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -337,16 +341,16 @@ xfs_dir_cilookup_result(
int
xfs_dir_lookup(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_ino_t *inum, /* out: inode number */
- struct xfs_name *ci_name) /* out: actual name if CI match */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
- struct xfs_da_args *args;
- int rval;
- int v; /* type-checking value */
- int lock_mode;
+ struct xfs_da_args *args;
+ int rval;
+ int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
@@ -475,7 +479,7 @@ int
xfs_dir_replace(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name, /* name of entry to replace */
+ const struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -728,7 +732,7 @@ xfs_dir2_namecheck(
xfs_dahash_t
xfs_dir2_hashname(
struct xfs_mount *mp,
- struct xfs_name *name)
+ const struct xfs_name *name)
{
if (unlikely(xfs_has_asciici(mp)))
return xfs_ascii_ci_hashname(name);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index d03e6098ded9..b6df3c34b26a 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -21,7 +21,7 @@ struct xfs_dir2_data_unused;
struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
-extern struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dotdot;
/*
* Convert inode mode to directory entry filetype
@@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t *inum,
+ const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 711709a2aa53..7404a9ff1a92 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr {
};
/* xfs_dir2.c */
-xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -201,7 +201,8 @@ xfs_dir2_data_entsize(
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
-xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 1719e1c4da59..3590e10e3e62 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -24,7 +24,7 @@ struct xchk_xattr_buf {
* space bitmap follows immediately after; and we have a third buffer
* for storing intermediate bitmap results.
*/
- uint8_t buf[0];
+ uint8_t buf[];
};
/* A place to store attribute values. */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9d6a67c7d227..90b7f4d127de 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -567,9 +567,9 @@ const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -581,7 +581,6 @@ const struct address_space_operations xfs_address_space_operations = {
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 667e297f59b1..ae4345b37621 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count)
return bio_max_segs(howmany(count, PAGE_SIZE));
}
-static void
-xfs_flush_bdev_async_endio(
- struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-/*
- * Submit a request for an async cache flush to run. If the request queue does
- * not require flush operations, just skip it altogether. If the caller needs
- * to wait for the flush completion at a later point in time, they must supply a
- * valid completion. This will be signalled when the flush completes. The
- * caller never sees the bio that is issued here.
- */
-void
-xfs_flush_bdev_async(
- struct bio *bio,
- struct block_device *bdev,
- struct completion *done)
-{
- struct request_queue *q = bdev->bd_disk->queue;
-
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- complete(done);
- return;
- }
-
- bio_init(bio, NULL, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- bio->bi_private = done;
- bio->bi_end_io = xfs_flush_bdev_async_endio;
-
- submit_bio(bio);
-}
int
xfs_rw_bdev(
struct block_device *bdev,
@@ -61,10 +26,9 @@ xfs_rw_bdev(
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op | REQ_META | REQ_SYNC;
do {
struct page *page = kmem_to_page(data);
@@ -74,10 +38,9 @@ xfs_rw_bdev(
while (bio_add_page(bio, page, len, off) != len) {
struct bio *prev = bio;
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_copy_dev(bio, prev);
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio->bi_opf = prev->bi_opf;
bio_chain(prev, bio);
submit_bio(prev);
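
Editor's note: both hunks in this file move to the consolidated bio_alloc() calling convention, where the target block device, vector count and opf are supplied at allocation time instead of being patched in afterwards. A hedged before/after sketch of the pattern (illustrative fragment, not the exact XFS code):

/* Old style: allocate, then point the bio at the device and set the op. */
bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC;

/* New style: bdev, vector count and opf are arguments to bio_alloc(). */
bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE | REQ_META | REQ_SYNC, GFP_KERNEL);
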
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e1f4d7d5a011..761dde155099 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -463,7 +463,7 @@ xfs_bui_item_recover(
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_filblks_t count;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b45e0d50a405..bf4e60871068 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -14,6 +14,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_recover.h"
+#include "xfs_log_priv.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
@@ -405,7 +406,7 @@ xfs_buf_alloc_pages(
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
- uint flags)
+ xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
@@ -813,7 +814,15 @@ xfs_buf_read_map(
* buffer.
*/
if (error) {
- if (!xfs_is_shutdown(target->bt_mount))
+ /*
+ * Check against log shutdown for error reporting because
+ * metadata writeback may require a read first and we need to
+ * report errors in metadata writeback until the log is shut
+ * down. High level transaction read functions already check
+ * against mount shutdown, anyway, so we only need to be
+ * concerned about low level IO interactions here.
+ */
+ if (!xlog_is_shutdown(target->bt_mount->m_log))
xfs_buf_ioerror_alert(bp, fa);
bp->b_flags &= ~XBF_DONE;
@@ -843,9 +852,6 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
- return;
-
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -862,7 +868,7 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -897,7 +903,7 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
@@ -1177,10 +1183,10 @@ xfs_buf_ioend_handle_error(
struct xfs_error_cfg *cfg;
/*
- * If we've already decided to shutdown the filesystem because of I/O
- * errors, there's no point in giving this a retry.
+ * If we've already shutdown the journal because of I/O errors, there's
+ * no point in giving this a retry.
*/
- if (xfs_is_shutdown(mp))
+ if (xlog_is_shutdown(mp->m_log))
goto out_stale;
xfs_buf_ioerror_alert_ratelimited(bp);
@@ -1440,12 +1446,10 @@ next_chunk:
atomic_inc(&bp->b_io_remaining);
nr_pages = bio_max_segs(total_nr_pages);
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio_set_dev(bio, bp->b_target->bt_bdev);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1593,8 +1597,23 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* on shutdown we stale and complete the buffer immediately */
- if (xfs_is_shutdown(bp->b_mount)) {
+ /*
+ * On log shutdown we stale and complete the buffer immediately. We can
+ * be called to read the superblock before the log has been set up, so
+ * be careful checking the log state.
+ *
+ * Checking the mount shutdown state here can result in the log tail
+ * moving inappropriately on disk as the log may not yet be shut down,
+ * i.e. failing this buffer on mount shutdown can remove it from the AIL
+ * and move the tail of the log forwards without having written this
+ * buffer to disk. This corrupts the log tail state in memory, and
+ * because the log may not be shut down yet, it can then be propagated
+ * to disk before the log is shut down. Hence we check log shutdown
+ * state here rather than mount state to avoid corrupting the log tail
+ * on shutdown.
+ */
+ if (bp->b_mount->m_log &&
+ xlog_is_shutdown(bp->b_mount->m_log)) {
xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1808,10 +1827,10 @@ xfs_buftarg_drain(
* If one or more failed buffers were freed, that means dirty metadata
* was thrown away. This should only ever happen after I/O completion
* handling has elevated I/O error(s) to permanent failures and shuts
- * down the fs.
+ * down the journal.
*/
if (write_fail) {
- ASSERT(xfs_is_shutdown(btp->bt_mount));
+ ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
}
@@ -2094,12 +2113,13 @@ xfs_buf_delwri_submit_buffers(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
pinned++;
continue;
}
- if (!xfs_buf_trylock(bp))
- continue;
} else {
xfs_buf_lock(bp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edcb6254fa6a..1ee3056ff9cf 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,28 +22,28 @@ struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
/* buffer type flags for write callbacks */
-#define _XBF_INODES (1 << 16)/* inode buffer */
-#define _XBF_DQUOTS (1 << 17)/* dquot buffer */
-#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_INODES, "INODES" }, \
{ _XBF_DQUOTS, "DQUOTS" }, \
- { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
@@ -247,11 +247,11 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
- struct xfs_buf **bpp);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, int flags, struct xfs_buf **bpp,
- const struct xfs_buf_ops *ops);
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
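
Editor's note: the flag definitions switch to unsigned literals because xfs_buf_flags_t is unsigned and `1 << 31` overflows a signed int before the conversion, which is undefined behaviour; `1u << 31` is well defined. A minimal standalone illustration:

#include <stdio.h>

typedef unsigned int demo_buf_flags_t;

#define DEMO_XBF_TRYLOCK	(1u << 30)	/* unsigned shift: well defined */
#define DEMO_XBF_UNMAPPED	(1u << 31)	/* top bit, still well defined */

int main(void)
{
	demo_buf_flags_t flags = DEMO_XBF_TRYLOCK | DEMO_XBF_UNMAPPED;

	/* With plain int, 1 << 31 would overflow the signed type instead. */
	printf("flags = %#x\n", flags);
	return 0;
}
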
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a7a8e4528881..522d450a94b1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -21,6 +21,7 @@
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
struct kmem_cache *xfs_buf_item_cache;
@@ -428,7 +429,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_has_v3inodes(lip->li_mountp) ||
+ if (xfs_has_v3inodes(lip->li_log->l_mp) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -616,7 +617,7 @@ xfs_buf_item_put(
* that case, the bli is freed on buffer writeback completion.
*/
aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xfs_is_shutdown(lip->li_mountp);
+ xlog_is_shutdown(lip->li_log);
dirty = bip->bli_flags & XFS_BLI_DIRTY;
if (dirty && !aborted)
return false;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 47ef9c9c5c17..0e50f2c9348e 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -604,7 +604,7 @@ xfs_efi_item_recover(
struct list_head *capture_list)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 48287caad28b..10e1cb71439e 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -864,8 +864,8 @@ xfs_getfsmap(
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
- use_rmap = capable(CAP_SYS_ADMIN) &&
- xfs_has_rmapbt(mp);
+ use_rmap = xfs_has_rmapbt(mp) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
head->fmh_entries = 0;
/* Set up our device handlers. */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 33e26690a8c4..68f74549fa22 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,6 +17,7 @@
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trace.h"
@@ -347,7 +348,7 @@ xfs_fs_counts(
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
+ xfs_fdblocks_unavailable(mp);
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
@@ -430,46 +431,36 @@ xfs_reserve_blocks(
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
*/
- error = -ENOSPC;
- do {
- free = percpu_counter_sum(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
- if (free <= 0)
- break;
-
- delta = request - mp->m_resblks;
- lcounter = free - delta;
- if (lcounter < 0)
- /* We can't satisfy the request, just get what we can */
- fdblks_delta = free;
- else
- fdblks_delta = delta;
-
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
- * count or we'll get an ENOSPC. If we get a ENOSPC, it means
- * things changed while we were calculating fdblks_delta and so
- * we should try again to see if there is anything left to
- * reserve.
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
*
- * Don't set the reserved flag here - we don't want to reserve
- * the extra reserve blocks from the reserve.....
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
*/
+ fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
- } while (error == -ENOSPC);
-
- /*
- * Update the reserve counters if blocks have been successfully
- * allocated.
- */
- if (!error && fdblks_delta) {
- mp->m_resblks += fdblks_delta;
- mp->m_resblks_avail += fdblks_delta;
}
-
out:
if (outval) {
outval->resblks = mp->m_resblks;
@@ -528,8 +519,11 @@ xfs_do_force_shutdown(
int tag;
const char *why;
- if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate))
+
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ xlog_shutdown_wait(mp->m_log);
return;
+ }
if (mp->m_sb_bp)
mp->m_sb_bp->b_flags |= XBF_DONE;
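
Editor's note: the rewritten xfs_reserve_blocks() now records the requested pool size unconditionally and moves only min(free, delta) blocks out of fdblocks, relying on later xfs_mod_fdblocks() calls to top up an undersized pool instead of retrying on ENOSPC. A userspace model of that decision (the numbers are arbitrary):

#include <stdio.h>

static long long min64(long long a, long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	long long free = 1000;		/* free blocks minus the set-aside */
	long long resblks = 200;	/* current reserve pool size */
	long long request = 1500;	/* newly requested pool size */
	long long delta = request - resblks;
	long long fdblks_delta = 0;

	resblks = request;		/* pool target is always updated */
	if (delta > 0 && free > 0)
		fdblks_delta = min64(free, delta);	/* partial fill is fine */

	printf("pool target=%lld, filled now=%lld, topped up later=%lld\n",
	       resblks, fdblks_delta, delta - fdblks_delta);
	return 0;
}
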
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9644f938990c..bffd6eb0b298 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -23,6 +23,7 @@
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
#include <linux/iversion.h>
@@ -77,7 +78,7 @@ xfs_inode_alloc(
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
* and return NULL here on ENOMEM.
*/
- ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
+ ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
if (inode_init_always(mp->m_super, VFS_I(ip))) {
kmem_cache_free(xfs_inode_cache, ip);
@@ -873,9 +874,16 @@ xfs_reclaim_inode(
if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
- if (xfs_is_shutdown(ip->i_mount)) {
+ /*
+ * Check for log shutdown because aborting the inode can move the log
+ * tail and corrupt in-memory state. This is fine if the log is shut
+ * down, but if the log is still active and only the mount is shut down
+ * then the in-memory log tail movement caused by the abort can be
+ * incorrectly propagated to disk.
+ */
+ if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
- xfs_iflush_abort(ip);
+ xfs_iflush_shutdown_abort(ip);
goto reclaim;
}
if (xfs_ipincount(ip))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 04bf467b1090..39ae53efb3ab 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
struct kmem_cache *xfs_inode_cache;
@@ -658,9 +659,9 @@ xfs_ip2xflags(
*/
int
xfs_lookup(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t **ipp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ struct xfs_inode **ipp,
struct xfs_name *ci_name)
{
xfs_ino_t inum;
@@ -1217,7 +1218,7 @@ xfs_link(
{
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
- int error;
+ int error, nospace_error = 0;
int resblks;
trace_xfs_link(tdp, target_name);
@@ -1236,19 +1237,11 @@ xfs_link(
goto std_return;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
- }
+ error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+ &tp, &nospace_error);
if (error)
goto std_return;
- xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
@@ -1306,6 +1299,8 @@ xfs_link(
error_return:
xfs_trans_cancel(tp);
std_return:
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -2599,14 +2594,13 @@ xfs_ifree_cluster(
}
/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
@@ -2628,13 +2622,16 @@ xfs_ifree(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/*
- * Pull the on-disk inode from the AGI unlinked list.
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, pag, ip);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- goto out;
+ return error;
- error = xfs_difree(tp, pag, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
goto out;
@@ -2755,6 +2752,7 @@ xfs_remove(
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+ int dontcare;
int error = 0;
uint resblks;
@@ -2772,31 +2770,24 @@ xfs_remove(
goto std_return;
/*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
+ * We try to get the real space reservation first, allowing for
+ * directory btree deletion(s) implying possible bmap insert(s). If we
+ * can't get the space reservation then we use 0 instead, and avoid the
+ * bmap btree insert(s) in the directory code by, if the bmap insert
+ * tries to happen, instead trimming the LAST block from the directory.
+ *
+ * Ignore EDQUOT and ENOSPC being returned via nospace_error because
+ * the directory code can handle a reservationless update and we don't
+ * want to prevent a user from trying to free space by deleting things.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
- &tp);
- }
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+ &tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
goto std_return;
}
- xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
/*
* If we're removing a directory perform some additional validation.
*/
@@ -3109,7 +3100,8 @@ xfs_rename(
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
- int error;
+ bool retried = false;
+ int error, nospace_error = 0;
trace_xfs_rename(src_dp, target_dp, src_name, target_name);
@@ -3133,9 +3125,12 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+retry:
+ nospace_error = 0;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
+ nospace_error = error;
spaceres = 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
&tp);
@@ -3190,6 +3185,31 @@ xfs_rename(
spaceres);
/*
+ * Try to reserve quota to handle an expansion of the target directory.
+ * We'll allow the rename to continue in reservationless mode if we hit
+ * a space usage constraint. If we trigger reservationless mode, save
+ * the errno if there isn't any free space in the target directory.
+ */
+ if (spaceres != 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
+ 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(target_dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ nospace_error = error;
+ spaceres = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*
@@ -3435,6 +3455,8 @@ out_trans_cancel:
out_release_wip:
if (wip)
xfs_irele(wip);
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -3611,7 +3633,7 @@ xfs_iflush_cluster(
/*
* We must use the safe variant here as on shutdown xfs_iflush_abort()
- * can remove itself from the list.
+ * will remove itself from the list.
*/
list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
iip = (struct xfs_inode_log_item *)lip;
@@ -3659,7 +3681,7 @@ xfs_iflush_cluster(
* AIL, leaving a dirty/unpinned inode attached to the buffer
* that otherwise looks like it should be flushed.
*/
- if (xfs_is_shutdown(mp)) {
+ if (xlog_is_shutdown(mp->m_log)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -3685,9 +3707,19 @@ xfs_iflush_cluster(
}
if (error) {
+ /*
+ * Shutdown first so we kill the log before we release this
+ * buffer. If it is an INODE_ALLOC buffer and pins the tail
+ * of the log, failing it before the _log_ is shut down can
+ * result in the log tail being moved forward in the journal
+ * on disk because log writes can still be taking place. Hence
+ * unpinning the tail will allow the ICREATE intent to be
+ * removed from the log and recovery will fail with uninitialised
+ * inode cluster buffers.
+ */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b7e8f14d9fca..740ab13d1aa2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -402,7 +402,7 @@ enum layout_break_reason {
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct user_namespace *mnt_userns,
struct xfs_inode *dp, struct xfs_name *name,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 90d8e591baf8..9e6ef55cf29e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include <linux/iversion.h>
@@ -543,10 +544,17 @@ xfs_inode_item_push(
uint rval = XFS_ITEM_SUCCESS;
int error;
- ASSERT(iip->ili_item.li_buf);
+ if (!bp || (ip->i_flags & XFS_ISTALE)) {
+ /*
+ * Inode item/buffer is being aborted due to cluster
+ * buffer deletion. Trigger a log force to have that operation
+ * completed and items removed from the AIL before the next push
+ * attempt.
+ */
+ return XFS_ITEM_PINNED;
+ }
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
- (ip->i_flags & XFS_ISTALE))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
if (xfs_iflags_test(ip, XFS_IFLUSHING))
@@ -720,6 +728,17 @@ xfs_iflush_ail_updates(
if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
continue;
+ /*
+ * dgc: Not sure how this happens, but it happens very
+ * occasionally via generic/388. xfs_iflush_abort() also
+ * silently handles this same "under writeback but not in AIL at
+ * shutdown" condition via xfs_trans_ail_delete().
+ */
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ ASSERT(xlog_is_shutdown(lip->li_log));
+ continue;
+ }
+
lsn = xfs_ail_delete_one(ailp, lip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
@@ -822,46 +841,143 @@ xfs_buf_inode_io_fail(
}
/*
- * This is the inode flushing abort routine. It is called when
- * the filesystem is shutting down to clean up the inode state. It is
- * responsible for removing the inode item from the AIL if it has not been
- * re-logged and clearing the inode's flush state.
+ * Clear the inode logging fields so no more flushes are attempted. If we are
+ * on a buffer list, it is now safe to remove it because the buffer is
+ * guaranteed to be locked. The caller will drop the reference to the buffer
+ * the log item held.
+ */
+static void
+xfs_iflush_abort_clean(
+ struct xfs_inode_log_item *iip)
+{
+ iip->ili_last_fields = 0;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ iip->ili_flush_lsn = 0;
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&iip->ili_item.li_bio_list);
+}
+
+/*
+ * Abort flushing the inode from a context holding the cluster buffer locked.
+ *
+ * This is the normal runtime method of aborting writeback of an inode that is
+ * attached to a cluster buffer. It occurs when the inode and the backing
+ * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster
+ * flushing or buffer IO completion encounters a log shutdown situation.
+ *
+ * If we need to abort inode writeback and we don't already hold the buffer
+ * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be
+ * necessary in a shutdown situation.
*/
void
xfs_iflush_abort(
struct xfs_inode *ip)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
- struct xfs_buf *bp = NULL;
+ struct xfs_buf *bp;
- if (iip) {
- /*
- * Clear the failed bit before removing the item from the AIL so
- * xfs_trans_ail_delete() doesn't try to clear and release the
- * buffer attached to the log item before we are done with it.
- */
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
- xfs_trans_ail_delete(&iip->ili_item, 0);
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ /*
+ * Remove the inode item from the AIL before we clear its internal
+ * state. Whilst the inode is in the AIL, it should have a valid buffer
+ * pointer for push operations to access - it is only safe to remove the
+ * inode from the buffer once it has been removed from the AIL.
+ *
+ * We also clear the failed bit before removing the item from the AIL
+ * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
+ * references the inode item owns and needs to hold until we've fully
+ * aborted the inode log item and detached it from the buffer.
+ */
+ clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+ xfs_trans_ail_delete(&iip->ili_item, 0);
+
+ /*
+ * Grab the inode buffer so we can release the reference the inode log
+ * item holds on it.
+ */
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ if (bp)
+ xfs_buf_rele(bp);
+}
+/*
+ * Abort an inode flush in the case of a shutdown filesystem. This can be called
+ * from anywhere with just an inode reference and does not require holding the
+ * inode cluster buffer locked. If the inode is attached to a cluster buffer,
+ * it will grab and lock it safely, then abort the inode flush.
+ */
+void
+xfs_iflush_shutdown_abort(
+ struct xfs_inode *ip)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
+
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ if (!bp) {
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ return;
+ }
+
+ /*
+ * We have to take a reference to the buffer so that it doesn't get
+ * freed when we drop the ili_lock and then wait to lock the buffer.
+ * We'll clean up the extra reference after we pick up the ili_lock
+ * again.
+ */
+ xfs_buf_hold(bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_buf_lock(bp);
+
+ spin_lock(&iip->ili_lock);
+ if (!iip->ili_item.li_buf) {
/*
- * Clear the inode logging fields so no more flushes are
- * attempted.
+ * Raced with another removal, we now hold the only reference
+ * to bp. The inode should not be in the AIL now, so just clean
+ * up and return.
*/
- spin_lock(&iip->ili_lock);
- iip->ili_last_fields = 0;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_flush_lsn = 0;
- bp = iip->ili_item.li_buf;
- iip->ili_item.li_buf = NULL;
- list_del_init(&iip->ili_item.li_bio_list);
+ ASSERT(list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags));
+ xfs_iflush_abort_clean(iip);
spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_buf_relse(bp);
+ return;
}
- xfs_iflags_clear(ip, XFS_IFLUSHING);
- if (bp)
- xfs_buf_rele(bp);
+
+ /*
+ * Got two references to bp. The first will get dropped by
+ * xfs_iflush_abort() when the item is removed from the buffer list, but
+ * we can't drop our reference until _abort() returns because we have to
+ * unlock the buffer as well. Hence we abort and then unlock and release
+ * our reference to the buffer.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ xfs_buf_relse(bp);
}
+
/*
* convert an xfs_inode_log_format struct from the old 32 bit version
* (which can have different field alignments) to the native 64 bit version
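For reference, a stripped-down sketch (not part of the patch) of the reference-then-lock-then-recheck pattern that xfs_iflush_shutdown_abort() uses above to take the cluster buffer lock without sleeping under ili_lock; the helper name is illustrative only:

static void
example_lock_cluster_buffer(
	struct xfs_inode_log_item	*iip)
{
	struct xfs_buf			*bp;

	spin_lock(&iip->ili_lock);
	bp = iip->ili_item.li_buf;
	if (!bp) {
		spin_unlock(&iip->ili_lock);
		return;				/* nothing attached */
	}
	xfs_buf_hold(bp);			/* keep bp alive across the unlock */
	spin_unlock(&iip->ili_lock);		/* can't sleep under a spinlock */

	xfs_buf_lock(bp);			/* may sleep */

	spin_lock(&iip->ili_lock);
	if (iip->ili_item.li_buf != bp) {
		/* lost a race with another abort; just drop our reference */
		spin_unlock(&iip->ili_lock);
		xfs_buf_relse(bp);
		return;
	}
	/* ... safe to tear the item down here while holding both locks ... */
	spin_unlock(&iip->ili_lock);
	xfs_buf_relse(bp);
}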
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1a302000d604..bbd836a44ff0 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip)
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2515fe8299e1..83481005317a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1189,7 +1189,7 @@ xfs_ioctl_setattr_get_trans(
goto out_error;
error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_error;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 004ed2a251e8..ca25ed89b706 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -217,7 +217,7 @@ xfs_compat_ioc_fsbulkstat(
inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall()) {
/*
* ... but on x32 the input xfs_fsop_bulkreq has pointers
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b79b3846e71b..b34e8e4344a8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -613,37 +613,6 @@ xfs_vn_getattr(
return 0;
}
-static void
-xfs_setattr_mode(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
- umode_t mode = iattr->ia_mode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
-}
-
-void
-xfs_setattr_time(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (iattr->ia_valid & ATTR_ATIME)
- inode->i_atime = iattr->ia_atime;
- if (iattr->ia_valid & ATTR_CTIME)
- inode->i_ctime = iattr->ia_ctime;
- if (iattr->ia_valid & ATTR_MTIME)
- inode->i_mtime = iattr->ia_mtime;
-}
-
static int
xfs_vn_change_ok(
struct user_namespace *mnt_userns,
@@ -678,10 +647,10 @@ xfs_setattr_nonsize(
int mask = iattr->ia_valid;
xfs_trans_t *tp;
int error;
- kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
- kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
- struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
+ struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL;
ASSERT((mask & ATTR_SIZE) == 0);
@@ -723,66 +692,30 @@ xfs_setattr_nonsize(
}
error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
/*
- * Change file ownership. Must be the owner or privileged.
+ * Register quota modifications in the transaction. Must be the owner
+ * or privileged. These IDs could have changed since we last looked at
+ * them. But, we're assured that if the ownership did change while we
+ * didn't have the inode locked, inode's dquot(s) would have changed
+ * also.
*/
- if (mask & (ATTR_UID|ATTR_GID)) {
- /*
- * These IDs could have changed since we last looked at them.
- * But, we're assured that if the ownership did change
- * while we didn't have the inode locked, inode's dquot(s)
- * would have changed also.
- */
- iuid = inode->i_uid;
- igid = inode->i_gid;
- gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
- uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
-
- /*
- * CAP_FSETID overrides the following restrictions:
- *
- * The set-user-ID and set-group-ID bits of a file will be
- * cleared upon successful return from chown()
- */
- if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
- !capable(CAP_FSETID))
- inode->i_mode &= ~(S_ISUID|S_ISGID);
-
- /*
- * Change the ownerships and register quota modifications
- * in the transaction.
- */
- if (!uid_eq(iuid, uid)) {
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(mask & ATTR_UID);
- ASSERT(udqp);
- olddquot1 = xfs_qm_vop_chown(tp, ip,
- &ip->i_udquot, udqp);
- }
- inode->i_uid = uid;
- }
- if (!gid_eq(igid, gid)) {
- if (XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(xfs_has_pquotino(mp) ||
- !XFS_IS_PQUOTA_ON(mp));
- ASSERT(mask & ATTR_GID);
- ASSERT(gdqp);
- olddquot2 = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
- }
- inode->i_gid = gid;
- }
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
+ !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ ASSERT(udqp);
+ old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
+ }
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
+ !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
+ ASSERT(gdqp);
+ old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
}
- if (mask & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
@@ -794,8 +727,8 @@ xfs_setattr_nonsize(
/*
* Release any dquot(s) the inode had kept before chown.
*/
- xfs_qm_dqrele(olddquot1);
- xfs_qm_dqrele(olddquot2);
+ xfs_qm_dqrele(old_udqp);
+ xfs_qm_dqrele(old_gdqp);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1006,11 +939,8 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- if (iattr->ia_valid & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 09a8fba84ff9..cb9105d667db 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, unsigned int op);
-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
- struct completion *done);
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 89fec9a18c34..499e15b24215 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -487,7 +487,10 @@ out_error:
* Run all the pending iclog callbacks and wake log force waiters and iclog
* space waiters so they can process the newly set shutdown state. We really
* don't care what order we process callbacks here because the log is shut down
- * and so state cannot change on disk anymore.
+ * and so state cannot change on disk anymore. However, we cannot wake waiters
+ * until the callbacks have been processed because we may be in unmount and
+ * we must ensure that all AIL operations the callbacks perform have completed
+ * before we tear down the AIL.
*
* We avoid processing actively referenced iclogs so that we don't run callbacks
* while the iclog owner might still be preparing the iclog for IO submission.
@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks(
struct xlog_in_core *iclog;
LIST_HEAD(cb_list);
- spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
do {
if (atomic_read(&iclog->ic_refcnt)) {
@@ -509,26 +511,22 @@ xlog_state_shutdown_callbacks(
continue;
}
list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
+
+ xlog_cil_process_committed(&cb_list);
+
+ spin_lock(&log->l_icloglock);
wake_up_all(&iclog->ic_write_wait);
wake_up_all(&iclog->ic_force_wait);
} while ((iclog = iclog->ic_next) != log->l_iclog);
wake_up_all(&log->l_flush_wait);
- spin_unlock(&log->l_icloglock);
-
- xlog_cil_process_committed(&cb_list);
}
/*
* Flush iclog to disk if this is the last reference to the given iclog and
* it is in the WANT_SYNC state.
*
- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
- * does not match, there may be metadata on disk that must be persisted before
- * this iclog is written. To satisfy that requirement, set the
- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
- * log tail value.
- *
* If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
* log tail is updated correctly. NEED_FUA indicates that the iclog will be
* written to stable storage, and implies that a commit record is contained
@@ -545,12 +543,10 @@ xlog_state_shutdown_callbacks(
* always capture the tail lsn on the iclog on the first NEED_FUA release
* regardless of the number of active reference counts on this iclog.
*/
-
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t old_tail_lsn)
+ struct xlog_in_core *iclog)
{
xfs_lsn_t tail_lsn;
bool last_ref;
@@ -561,18 +557,14 @@ xlog_state_release_iclog(
/*
* Grabbing the current log tail needs to be atomic w.r.t. the writing
* of the tail LSN into the iclog so we guarantee that the log tail does
- * not move between deciding if a cache flush is required and writing
- * the LSN into the iclog below.
+ * not move between the first time we know that the iclog needs to be
+ * made stable and when we eventually submit it.
*/
- if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+ !iclog->ic_header.h_tail_lsn) {
tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
- if (old_tail_lsn && tail_lsn != old_tail_lsn)
- iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
-
- if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
- !iclog->ic_header.h_tail_lsn)
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
}
last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
@@ -583,11 +575,8 @@ xlog_state_release_iclog(
* pending iclog callbacks that were waiting on the release of
* this iclog.
*/
- if (last_ref) {
- spin_unlock(&log->l_icloglock);
+ if (last_ref)
xlog_state_shutdown_callbacks(log);
- spin_lock(&log->l_icloglock);
- }
return -EIO;
}
@@ -600,8 +589,6 @@ xlog_state_release_iclog(
}
iclog->ic_state = XLOG_STATE_SYNCING;
- if (!iclog->ic_header.h_tail_lsn)
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
xlog_verify_tail_lsn(log, iclog);
trace_xlog_iclog_syncing(iclog, _RET_IP_);
@@ -812,10 +799,9 @@ xfs_log_mount_finish(
* mount failure occurs.
*/
mp->m_super->s_flags |= SB_ACTIVE;
+ xfs_log_work_queue(mp);
if (xlog_recovery_needed(log))
error = xlog_recover_finish(log);
- if (!error)
- xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~SB_ACTIVE;
evict_inodes(mp->m_super);
@@ -874,7 +860,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
- return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog);
}
/*
@@ -1102,7 +1088,7 @@ xfs_log_item_init(
int type,
const struct xfs_item_ops *ops)
{
- item->li_mountp = mp;
+ item->li_log = mp->m_log;
item->li_ailp = mp->m_ail;
item->li_type = type;
item->li_ops = ops;
@@ -1374,7 +1360,7 @@ xlog_ioend_work(
*/
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
xlog_state_done_syncing(iclog);
@@ -1883,19 +1869,19 @@ xlog_write_iclog(
return;
}
- bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
- iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
- iclog->ic_bio.bi_end_io = xlog_bio_end_io;
- iclog->ic_bio.bi_private = iclog;
-
/*
* We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more
* IOs coming immediately after this one. This prevents the block layer
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
/*
@@ -1913,7 +1899,7 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return;
}
if (is_vmalloc_addr(iclog->ic_data))
@@ -2412,7 +2398,7 @@ xlog_write_copy_finish(
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
xlog_is_shutdown(log));
release_iclog:
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
return error;
}
@@ -2488,7 +2474,7 @@ xlog_write(
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
xlog_print_tic_res(log->l_mp, ticket);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
len = xlog_write_calc_vec_length(ticket, log_vector, optype);
@@ -2629,7 +2615,7 @@ next_lv:
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
return error;
@@ -3053,7 +3039,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog, 0);
+ error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3822,9 +3808,10 @@ xlog_verify_iclog(
#endif
/*
- * Perform a forced shutdown on the log. This should be called once and once
- * only by the high level filesystem shutdown code to shut the log subsystem
- * down cleanly.
+ * Perform a forced shutdown on the log.
+ *
+ * This can be called from low level log code to trigger a shutdown, or from the
+ * high level mount shutdown code when the mount shuts down.
*
* Our main objectives here are to make sure that:
* a. if the shutdown was not due to a log IO error, flush the logs to
@@ -3833,6 +3820,8 @@ xlog_verify_iclog(
* parties to find out. Nothing new gets queued after this is done.
* c. Tasks sleeping on log reservations, pinned objects and
* other resources get woken up.
+ * d. The mount is also marked as shut down so that log triggered shutdowns
+ * still behave the same as if they called xfs_forced_shutdown().
*
* Return true if the shutdown cause was a log IO error and we actually shut the
* log down.
@@ -3844,25 +3833,25 @@ xlog_force_shutdown(
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
- /*
- * If this happens during log recovery then we aren't using the runtime
- * log mechanisms yet so there's nothing to shut down.
- */
- if (!log || xlog_in_recovery(log))
+ if (!log)
return false;
- ASSERT(!xlog_is_shutdown(log));
-
/*
* Flush all the completed transactions to disk before marking the log
* being shut down. We need to do this first as shutting down the log
* before the force will prevent the log force from flushing the iclogs
* to disk.
*
- * Re-entry due to a log IO error shutdown during the log force is
- * prevented by the atomicity of higher level shutdown code.
+ * When we are in recovery, there are no transactions to flush, and
+ * we don't want to touch the log because we don't want to perturb the
+ * current head/tail for future recovery attempts. Hence we need to
+ * avoid a log force in this case.
+ *
+ * If we are shutting down due to a log IO error, then we must avoid
+ * trying to write the log as that may just result in more IO errors and
+ * an endless shutdown/force loop.
*/
- if (!log_error)
+ if (!log_error && !xlog_in_recovery(log))
xfs_log_force(log->l_mp, XFS_LOG_SYNC);
/*
@@ -3879,12 +3868,25 @@ xlog_force_shutdown(
spin_lock(&log->l_icloglock);
if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
spin_unlock(&log->l_icloglock);
- ASSERT(0);
return false;
}
spin_unlock(&log->l_icloglock);
/*
+ * If this log shutdown also sets the mount shutdown state, issue a
+ * shutdown warning message.
+ */
+ if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+ shutdown_flags);
+ xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+ }
+
+ /*
* We don't want anybody waiting for log reservations after this. That
* means we have to wake up everybody queued up on reserveq as well as
* writeq. In addition, we make sure in xlog_{re}grant_log_space that
@@ -3904,8 +3906,12 @@ xlog_force_shutdown(
wake_up_all(&log->l_cilp->xc_start_wait);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
+
+ spin_lock(&log->l_icloglock);
xlog_state_shutdown_callbacks(log);
+ spin_unlock(&log->l_icloglock);
+ wake_up_var(&log->l_opstate);
return log_error;
}
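For context, a simplified sketch (not part of the patch) of the lock juggling xlog_state_shutdown_callbacks() now performs per iclog: splice the pending callbacks off while holding l_icloglock, drop the lock to run them, then retake it before waking that iclog's waiters. Names mirror the hunk above; the helper itself is illustrative:

static void
example_process_one_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	LIST_HEAD(cb_list);

	/* caller holds log->l_icloglock */
	list_splice_init(&iclog->ic_callbacks, &cb_list);
	spin_unlock(&log->l_icloglock);

	/* may take the AIL lock and sleep; cannot run under l_icloglock */
	xlog_cil_process_committed(&cb_list);

	spin_lock(&log->l_icloglock);
	wake_up_all(&iclog->ic_write_wait);
	wake_up_all(&iclog->ic_force_wait);
}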
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 83a039762b81..ba57323bfdce 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -540,7 +540,7 @@ xlog_cil_insert_items(
spin_unlock(&cil->xc_cil_lock);
if (tp->t_ticket->t_curr_res < 0)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
static void
@@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state(
* The LSN we need to pass to the log items on transaction
* commit is the LSN reported by the first log vector write, not
* the commit lsn. If we use the commit record lsn then we can
- * move the tail beyond the grant write head.
+ * move the grant write head beyond the tail LSN and overwrite
+ * it.
*/
ctx->start_lsn = lsn;
wake_up_all(&cil->xc_start_wait);
spin_unlock(&cil->xc_push_lock);
+
+ /*
+ * Make sure the metadata we are about to overwrite in the log
+ * has been flushed to stable storage before this iclog is
+ * issued.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ spin_unlock(&cil->xc_log->l_icloglock);
return;
}
@@ -854,7 +864,7 @@ xlog_cil_write_commit_record(
error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
if (error)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -888,10 +898,7 @@ xlog_cil_push_work(
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
- xfs_lsn_t preflush_tail_lsn;
xfs_csn_t push_seq;
- struct bio bio;
- DECLARE_COMPLETION_ONSTACK(bdev_flush);
bool push_commit_stable;
new_ctx = xlog_cil_ctx_alloc();
@@ -962,23 +969,6 @@ xlog_cil_push_work(
spin_unlock(&cil->xc_push_lock);
/*
- * The CIL is stable at this point - nothing new will be added to it
- * because we hold the flush lock exclusively. Hence we can now issue
- * a cache flush to ensure all the completed metadata in the journal we
- * are about to overwrite is on stable storage.
- *
- * Because we are issuing this cache flush before we've written the
- * tail lsn to the iclog, we can have metadata IO completions move the
- * tail forwards between the completion of this flush and the iclog
- * being written. In this case, we need to re-issue the cache flush
- * before the iclog write. To detect whether the log tail moves, sample
- * the tail LSN *before* we issue the flush.
- */
- preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
- xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
- &bdev_flush);
-
- /*
* Pull all the log vectors off the items in the CIL, and remove the
* items from the CIL. We don't need the CIL lock here because it's only
* needed on the transaction commit side which is currently locked out
@@ -1054,12 +1044,6 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- /*
- * Before we format and submit the first iclog, we have to ensure that
- * the metadata writeback ordering cache flush is complete.
- */
- wait_for_completion(&bdev_flush);
-
error = xlog_cil_write_chain(ctx, &lvhdr);
if (error)
goto out_abort_free_ticket;
@@ -1118,7 +1102,7 @@ xlog_cil_push_work(
if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
- xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
@@ -1139,7 +1123,7 @@ out_abort_free_ticket:
return;
}
spin_lock(&log->l_icloglock);
- xlog_state_release_iclog(log, ctx->commit_iclog, 0);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
}
@@ -1243,18 +1227,27 @@ xlog_cil_push_now(
if (!async)
flush_workqueue(cil->xc_push_wq);
+ spin_lock(&cil->xc_push_lock);
+
+ /*
+ * If this is an async flush request, we always need to set the
+ * xc_push_commit_stable flag even if something else has already queued
+ * a push. The flush caller is asking for the CIL to be on stable
+ * storage when the next push completes, so regardless of who has queued
+ * the push, the flush requires stable semantics from it.
+ */
+ cil->xc_push_commit_stable = async;
+
/*
* If the CIL is empty or we've already pushed the sequence then
- * there's no work we need to do.
+ * there's no more work that we need to do.
*/
- spin_lock(&cil->xc_push_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
- cil->xc_push_commit_stable = async;
queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock);
}
@@ -1352,6 +1345,13 @@ xlog_cil_flush(
trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
xlog_cil_push_now(log, seq, true);
+
+ /*
+ * If the CIL is empty, make sure that any previous checkpoint that may
+ * still be in an active iclog is pushed to stable storage.
+ */
+ if (list_empty(&log->l_cilp->xc_cil))
+ xfs_log_force(log->l_mp, 0);
}
/*
@@ -1468,7 +1468,7 @@ bool
xfs_log_item_in_current_chkpt(
struct xfs_log_item *lip)
{
- struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
+ struct xfs_cil *cil = lip->li_log->l_cilp;
if (list_empty(&lip->li_cil))
return false;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 23103d68423c..401cdc400980 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log)
return test_bit(XLOG_IO_ERROR, &log->l_opstate);
}
+/*
+ * Wait until the xlog_force_shutdown() has marked the log as shut down
+ * so xlog_is_shutdown() will always return true.
+ */
+static inline void
+xlog_shutdown_wait(
+ struct xlog *log)
+{
+ wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
+}
+
/* common routines */
extern int
xlog_recover(
@@ -524,8 +535,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
- xfs_lsn_t log_tail_lsn);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
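The new xlog_shutdown_wait() helper pairs with the wake_up_var(&log->l_opstate) call added to xlog_force_shutdown() in xfs_log.c above. A minimal sketch of that wait/wake idiom, with illustrative function names that are not part of the patch:

/* Shutdown side: mark the log shut down, then wake any waiters. */
static void
example_shutdown_side(
	struct xlog	*log)
{
	if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate))
		return;		/* already shut down, nothing to wake */
	/* pairs with wait_var_event() in xlog_shutdown_wait() */
	wake_up_var(&log->l_opstate);
}

/* Waiter side: sleep until xlog_is_shutdown() is observed true. */
static void
example_waiter_side(
	struct xlog	*log)
{
	wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
}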
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 96c997ed2ec8..c4ad4296c540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2485,7 +2485,7 @@ xlog_finish_defer_ops(
error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -2519,21 +2519,22 @@ xlog_abort_defer_ops(
xfs_defer_ops_capture_free(mp, dfc);
}
}
+
/*
* When this is called, all of the log intent items which did not have
- * corresponding log done items should be in the AIL. What we do now
- * is update the data structures associated with each one.
+ * corresponding log done items should be in the AIL. What we do now is update
+ * the data structures associated with each one.
*
- * Since we process the log intent items in normal transactions, they
- * will be removed at some point after the commit. This prevents us
- * from just walking down the list processing each one. We'll use a
- * flag in the intent item to skip those that we've already processed
- * and use the AIL iteration mechanism's generation count to try to
- * speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they will be
+ * removed at some point after the commit. This prevents us from just walking
+ * down the list processing each one. We'll use a flag in the intent item to
+ * skip those that we've already processed and use the AIL iteration mechanism's
+ * generation count to try to speed this up at least a bit.
*
- * When we start, we know that the intents are the only things in the
- * AIL. As we process them, however, other items are added to the
- * AIL.
+ * When we start, we know that the intents are the only things in the AIL. As we
+ * process them, however, other items are added to the AIL. Hence we know we
+ * have started recovery on all the pending intents when we find a non-intent
+ * item in the AIL.
*/
STATIC int
xlog_recover_process_intents(
@@ -2556,17 +2557,8 @@ xlog_recover_process_intents(
for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
lip != NULL;
lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ if (!xlog_item_is_intent(lip))
break;
- }
/*
* We should never see a redo item with a LSN higher than
@@ -2607,8 +2599,9 @@ err:
}
/*
- * A cancel occurs when the mount has failed and we're bailing out.
- * Release all pending log intent items so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending log intent items that we haven't started recovery on so they don't
+ * pin the AIL.
*/
STATIC void
xlog_recover_cancel_intents(
@@ -2622,17 +2615,8 @@ xlog_recover_cancel_intents(
spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ if (!xlog_item_is_intent(lip))
break;
- }
spin_unlock(&ailp->ail_lock);
lip->li_ops->iop_release(lip);
@@ -3470,7 +3454,7 @@ xlog_recover_finish(
*/
xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
@@ -3517,7 +3501,7 @@ xlog_recover_finish(
* end of intents processing can be pushed through the CIL
* and AIL.
*/
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bed73e8002a5..c5f153c3693f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -21,6 +21,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
@@ -1146,7 +1147,7 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+ set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 00720a02e761..f6dc19de8322 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -479,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+static inline uint64_t
+xfs_fdblocks_unavailable(
+ struct xfs_mount *mp)
+{
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
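A small usage sketch (not in the patch): combining the new helper with a clamped subtraction, as xfs_fs_statfs() does further below, so the reported free-block count can never underflow:

/* Illustrative helper: free blocks actually reportable to userspace. */
static inline int64_t
example_reportable_fdblocks(
	struct xfs_mount	*mp,
	int64_t			fdblocks)
{
	/* never report blocks that are set aside or in use by the btrees */
	return max_t(int64_t, 0, fdblocks - xfs_fdblocks_unavailable(mp));
}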
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 4abe17312c2b..37a24f0f7cd4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -319,7 +319,8 @@ xfs_fs_commit_blocks(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_setattr_time(ip, iattr);
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(&init_user_ns, inode, iattr);
if (update_isize) {
i_size_write(inode, iattr->ia_size);
ip->i_disk_size = iattr->ia_size;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 32ac8d9c8940..f165d1a3de1d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -121,8 +122,7 @@ xfs_qm_dqpurge(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
int error = -EAGAIN;
xfs_dqlock(dqp);
@@ -157,7 +157,7 @@ xfs_qm_dqpurge(
}
ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(xfs_is_shutdown(mp) ||
+ ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) ||
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
@@ -172,7 +172,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(mp, xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d3da67772d57..0d868c93144d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -457,7 +457,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
unsigned int refc_type;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index db70060e7bf6..54e68e5693fd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -425,7 +425,10 @@ convert:
if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
trace_xfs_reflink_convert_cow(ip, cmap);
- return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+ return error;
out_trans_cancel:
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index c3966b4c58ef..a22b2d19ef91 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -510,7 +510,7 @@ xfs_rui_item_recover(
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
int i;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d84714e4e46a..54be9d64093e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -815,7 +815,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
/* make sure statp->f_bfree does not underflow */
- statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0);
+ statp->f_bfree = max_t(int64_t, 0,
+ fdblocks - xfs_fdblocks_unavailable(mp));
statp->f_bavail = statp->f_bfree;
fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4a8076ef8cb4..b141ef78c755 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
DECLARE_EVENT_CLASS(xfs_namespace_class,
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
TP_ARGS(dp, name),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
#define DEFINE_NAMESPACE_EVENT(name) \
DEFINE_EVENT(xfs_namespace_class, name, \
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \
TP_ARGS(dp, name))
DEFINE_NAMESPACE_EVENT(xfs_remove);
DEFINE_NAMESPACE_EVENT(xfs_link);
@@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(xfs_lsn_t, new_lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 59e2f9031b9f..0ac717aad380 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -648,7 +648,7 @@ xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_log == tp->t_mountp->m_log);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
ASSERT(list_empty(&lip->li_trans));
ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(xfs_is_shutdown(ailp->ail_mount));
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
if (lip->li_ops->iop_unpin)
lip->li_ops->iop_unpin(lip, 1);
continue;
@@ -836,6 +836,7 @@ __xfs_trans_commit(
bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
xfs_csn_t commit_seq = 0;
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -864,7 +865,13 @@ __xfs_trans_commit(
if (!(tp->t_flags & XFS_TRANS_DIRTY))
goto out_unreserve;
- if (xfs_is_shutdown(mp)) {
+ /*
+ * We must check against log shutdown here because we cannot abort log
+ * items and leave them dirty, inconsistent and unpinned in memory while
+ * the log is active. This leaves them open to being written back to
+ * disk, and that will lead to on-disk corruption.
+ */
+ if (xlog_is_shutdown(log)) {
error = -EIO;
goto out_unreserve;
}
@@ -878,7 +885,7 @@ __xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
+ xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -905,10 +912,10 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- if (regrant && !xlog_is_shutdown(mp->m_log))
- xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
else
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
xfs_trans_free_items(tp, !!error);
@@ -926,18 +933,27 @@ xfs_trans_commit(
}
/*
- * Unlock all of the transaction's items and free the transaction.
- * The transaction must not have modified any of its items, because
- * there is no way to restore them to their previous state.
+ * Unlock all of the transaction's items and free the transaction. If the
+ * transaction is dirty, we must shut down the filesystem because there is no
+ * way to restore them to their previous state.
*
- * If the transaction has made a log reservation, make sure to release
- * it as well.
+ * If the transaction has made a log reservation, make sure to release it as
+ * well.
+ *
+ * This is a high level function (equivalent to xfs_trans_commit()) and so can
+ * be called after the transaction has effectively been aborted due to the mount
+ * being shut down. However, if the mount has not been shut down and the
+ * transaction is dirty, we will shut the mount down and, in doing so,
+ * guarantee that the log is shut down too. Hence we don't need to be as
+ * careful with shutdown state and dirty items here as we need to be in
+ * xfs_trans_commit().
*/
void
xfs_trans_cancel(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
trace_xfs_trans_cancel(tp, _RET_IP_);
@@ -955,16 +971,18 @@ xfs_trans_cancel(
}
/*
- * See if the caller is relying on us to shut down the
- * filesystem. This happens in paths where we detect
- * corruption and decide to give up.
+ * See if the caller is relying on us to shut down the filesystem. We
+ * only want an error report if there isn't already a shutdown in
+ * progress, so we only need to check against the mount shutdown state
+ * here.
*/
if (dirty && !xfs_is_shutdown(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!dirty && !xfs_is_shutdown(mp)) {
+ /* Log items need to be consistent until the log is shut down. */
+ if (!dirty && !xlog_is_shutdown(log)) {
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
@@ -975,7 +993,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
@@ -1210,3 +1228,89 @@ out_cancel:
xfs_trans_cancel(tp);
return error;
}
+
+/*
+ * Allocate a transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update. If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @nospace_error will be set to a negative errno describing the space
+ * constraint we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+ struct xfs_inode *dp,
+ struct xfs_trans_res *resv,
+ struct xfs_inode *ip,
+ unsigned int *dblocks,
+ struct xfs_trans **tpp,
+ int *nospace_error)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int resblks;
+ bool retried = false;
+ int error;
+
+retry:
+ *nospace_error = 0;
+ resblks = *dblocks;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ *nospace_error = error;
+ resblks = 0;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ }
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(dp, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ if (resblks == 0)
+ goto done;
+
+ error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ *nospace_error = error;
+ resblks = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_cancel;
+
+done:
+ *tpp = tp;
+ *dblocks = resblks;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
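A hedged sketch of how a directory update path might consume xfs_trans_alloc_dir(); the caller, the reservation used and the example_do_dir_change() helper are illustrative assumptions, not taken from this patch:

static int
example_dir_update(
	struct xfs_inode	*dp,
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_trans	*tp;
	unsigned int		resblks = XFS_REMOVE_SPACE_RES(mp);
	int			nospace_error = 0;
	int			error;

	error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
			&tp, &nospace_error);
	if (error)
		return error;

	/* resblks may now be zero: a reservationless update was granted */
	error = example_do_dir_change(tp, dp, ip, resblks);	/* hypothetical */
	if (error) {
		/* prefer the earlier space error for a clearer report */
		if (nospace_error)
			error = nospace_error;
		xfs_trans_cancel(tp);
		return error;
	}
	return xfs_trans_commit(tp);
}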
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a487b264a9eb..0c82673238f4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -8,6 +8,7 @@
/* kernel only transaction subsystem defines */
+struct xlog;
struct xfs_buf;
struct xfs_buftarg;
struct xfs_efd_log_item;
@@ -31,7 +32,7 @@ struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xlog *li_log;
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
unsigned long li_flags; /* misc flags */
@@ -174,7 +175,7 @@ xfs_trans_get_buf(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
@@ -259,6 +260,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+ struct xfs_inode *ip, unsigned int *dblocks,
+ struct xfs_trans **tpp, int *nospace_error);
static inline void
xfs_trans_set_context(
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2a8c8dc54c95..d3a97a028560 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -398,7 +398,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
@@ -418,7 +418,7 @@ static long
xfsaild_push(
struct xfs_ail *ailp)
{
- xfs_mount_t *mp = ailp->ail_mount;
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
@@ -443,15 +443,27 @@ xfsaild_push(
ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
- xlog_cil_flush(mp->m_log);
+ xlog_cil_flush(ailp->ail_log);
}
spin_lock(&ailp->ail_lock);
- /* barrier matches the ail_target update in xfs_ail_push() */
- smp_rmb();
- target = ailp->ail_target;
- ailp->ail_target_prev = target;
+ /*
+ * If we have a sync push waiter, we always have to push till the AIL is
+ * empty. Update the target to point to the end of the AIL so that we
+ * capture updates that occur after the sync push waiter has gone to
+ * sleep.
+ */
+ if (waitqueue_active(&ailp->ail_empty)) {
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ target = lip->li_lsn;
+ } else {
+ /* barrier matches the ail_target update in xfs_ail_push() */
+ smp_rmb();
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
+ }
/* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
@@ -620,7 +632,7 @@ xfsaild(
* opportunity to release such buffers from the queue.
*/
ASSERT(list_empty(&ailp->ail_buf_list) ||
- xfs_is_shutdown(ailp->ail_mount));
+ xlog_is_shutdown(ailp->ail_log));
xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -683,7 +695,7 @@ xfs_ail_push(
struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
- if (!lip || xfs_is_shutdown(ailp->ail_mount) ||
+ if (!lip || xlog_is_shutdown(ailp->ail_log) ||
XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
@@ -724,7 +736,6 @@ xfs_ail_push_all_sync(
spin_lock(&ailp->ail_lock);
while ((lip = xfs_ail_max(ailp)) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
- ailp->ail_target = lip->li_lsn;
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
schedule();
@@ -740,7 +751,7 @@ xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xlog *log = ailp->ail_log;
/* if the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
@@ -748,13 +759,13 @@ xfs_ail_update_finish(
return;
}
- if (!xfs_is_shutdown(mp))
- xlog_assign_tail_lsn_locked(mp);
+ if (!xlog_is_shutdown(log))
+ xlog_assign_tail_lsn_locked(log->l_mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
- xfs_log_space_wake(mp);
+ xfs_log_space_wake(log->l_mp);
}
/*
@@ -862,17 +873,17 @@ xfs_trans_ail_delete(
int shutdown_type)
{
struct xfs_ail *ailp = lip->li_ailp;
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xlog *log = ailp->ail_log;
xfs_lsn_t tail_lsn;
spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (shutdown_type && !xfs_is_shutdown(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ if (shutdown_type && !xlog_is_shutdown(log)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
- xfs_force_shutdown(mp, shutdown_type);
+ xlog_force_shutdown(log, shutdown_type);
}
return;
}
@@ -893,7 +904,7 @@ xfs_trans_ail_init(
if (!ailp)
return -ENOMEM;
- ailp->ail_mount = mp;
+ ailp->ail_log = mp->m_log;
INIT_LIST_HEAD(&ailp->ail_head);
INIT_LIST_HEAD(&ailp->ail_cursors);
spin_lock_init(&ailp->ail_lock);
@@ -901,7 +912,7 @@ xfs_trans_ail_init(
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_super->s_id);
+ mp->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3004aeac9110..f0d79a9050ba 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -6,6 +6,7 @@
#ifndef __XFS_TRANS_PRIV_H__
#define __XFS_TRANS_PRIV_H__
+struct xlog;
struct xfs_log_item;
struct xfs_mount;
struct xfs_trans;
@@ -50,7 +51,7 @@ struct xfs_ail_cursor {
* Eventually we need to drive the locking in here as well.
*/
struct xfs_ail {
- struct xfs_mount *ail_mount;
+ struct xlog *ail_log;
struct task_struct *ail_task;
struct list_head ail_head;
xfs_lsn_t ail_target;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b76dfb310ab6..e20e7c841489 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -35,6 +35,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode,
lockdep_assert_held(&zi->i_truncate_mutex);
+ /*
+ * With ZNS drives, closing an explicitly open zone that has not been
+ * written will change the zone state to "closed", that is, the zone
+ * will remain active. Since this can then cause failure of explicit
+ * open operations on other zones if the drive's active zone resources
+ * are exceeded, make sure that the zone does not remain active by
+ * resetting it.
+ */
+ if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ op = REQ_OP_ZONE_RESET;
+
trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
@@ -185,9 +196,9 @@ static const struct address_space_operations zonefs_file_aops = {
.readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.migratepage = iomap_migrate_page,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -692,12 +703,10 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc(GFP_NOFS, nr_pages);
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_pages,
+ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
- bio->bi_write_hint = iocb->ki_hint;
bio->bi_ioprio = iocb->ki_ioprio;
- bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
@@ -1137,13 +1146,14 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
{
struct zonefs_inode_info *zi;
- zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL);
+ zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
if (!zi)
return NULL;
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
zi->i_wr_refcnt = 0;
+ zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1295,12 +1305,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
inc_nlink(parent);
}
-static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+ enum zonefs_ztype type)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@@ -1325,6 +1336,22 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+
+ /*
+ * For sequential zones, make sure that any open zone is closed first
+ * to ensure that the initial number of open zones is 0, in sync with
+ * the open zone accounting done when the mount option
+ * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (type == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ mutex_lock(&zi->i_truncate_mutex);
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ return ret;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@@ -1334,6 +1361,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
+ int ret;
dentry = d_alloc_name(parent, name);
if (!dentry)
@@ -1344,10 +1372,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
goto dput;
inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone)
- zonefs_init_file_inode(inode, zone, type);
- else
+ if (zone) {
+ ret = zonefs_init_file_inode(inode, zone, type);
+ if (ret) {
+ iput(inode);
+ goto dput;
+ }
+ } else {
zonefs_init_dir_inode(dir, inode, type);
+ }
+
d_add(dentry, inode);
dir->i_size++;
@@ -1541,10 +1575,8 @@ static int zonefs_read_super(struct super_block *sb)
if (!page)
return -ENOMEM;
- bio_init(&bio, &bio_vec, 1);
+ bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = 0;
- bio.bi_opf = REQ_OP_READ;
- bio_set_dev(&bio, sb->s_bdev);
bio_add_page(&bio, page, PAGE_SIZE, 0);
ret = submit_bio_wait(&bio);